Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -12,6 +12,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import get_dtype_size
+import vllm.envs as envs
 
 logger = init_logger(__name__)
 
@@ -34,6 +35,25 @@ class KVCacheSpec:
             The page size
         """
         raise NotImplementedError
+    @property
+    def scale_page_size_bytes(self) -> int:
+        """
+        The size of a scale page with `block_size` tokens in bytes.
+
+        Returns:
+            The scale page size
+        """
+        raise NotImplementedError
+
+    @property
+    def v_cache_scale_size_bytes(self) -> int:
+        """
+        The size of the v_cache scale in bytes.
+
+        Returns:
+            The v_cache scale size
+        """
+        raise NotImplementedError
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         """
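
For orientation, a minimal sketch of how a concrete spec could satisfy the two new abstract properties; the shapes (16-token blocks, 8 KV heads, head size 128, fp32 scales) are assumed for illustration and are not taken from this change.

class _ToySpec:
    # Assumed example shapes, not vLLM defaults.
    block_size, num_kv_heads, head_size = 16, 8, 128

    @property
    def scale_page_size_bytes(self) -> int:
        return self.block_size * self.num_kv_heads * 4   # 512 bytes of fp32 scales per block

    @property
    def v_cache_scale_size_bytes(self) -> int:
        return self.head_size * self.num_kv_heads * 4    # 4096 bytes for the V-cache scales
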
@@ -78,13 +98,27 @@ class AttentionSpec(KVCacheSpec):
 
     @property
     def real_page_size_bytes(self) -> int:
-        return (
-            2
-            * self.block_size
-            * self.num_kv_heads
-            * self.head_size
-            * get_dtype_size(self.dtype)
-        )
+        if envs.VLLM_ATTN_OPT_LEVEL == 1:
+            # MLA and i8q-i8k-i8v allocate the same amount of memory.
+            return 2 * self.block_size * self.num_kv_heads * self.head_size \
+                * get_dtype_size(torch.int8)
+        elif envs.VLLM_ATTN_OPT_LEVEL == 2:
+            # i8q-i8k-f16v allocates f16 plus int8, hence the factor of 3.
+            return 3 * self.block_size * self.num_kv_heads * self.head_size \
+                * get_dtype_size(torch.int8)
+        return 2 * self.block_size * self.num_kv_heads * self.head_size \
+            * get_dtype_size(self.dtype)
+    @property
+    def scale_page_size_bytes(self) -> int:
+        # For MLA we only store a single latent vector
+        if envs.VLLM_ATTN_OPT_LEVEL > 0:
+            return self.block_size * self.num_kv_heads * get_dtype_size(torch.float32)
+        else:
+            return 0
+
+    @property
+    def v_cache_scale_size_bytes(self) -> int:
+        return self.head_size * self.num_kv_heads * get_dtype_size(torch.float32)
 
 
 @dataclass(frozen=True, kw_only=True)
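
To make the byte accounting concrete, a worked example under each VLLM_ATTN_OPT_LEVEL setting; the shapes (fp16 KV cache, 16-token blocks, 8 KV heads, head size 128) are assumptions for illustration, not values from any model config.

block_size, num_kv_heads, head_size = 16, 8, 128
fp16, int8, fp32 = 2, 1, 4  # bytes per element

page_default = 2 * block_size * num_kv_heads * head_size * fp16   # level 0: fp16 K and V -> 65536
page_level1 = 2 * block_size * num_kv_heads * head_size * int8    # level 1: int8 K and V -> 32768
page_level2 = 3 * block_size * num_kv_heads * head_size * int8    # level 2: int8 K + fp16 V -> 49152
scale_page = block_size * num_kv_heads * fp32                     # scale page when the level is non-zero -> 512
v_scale = head_size * num_kv_heads * fp32                         # matches v_cache_scale_size_bytes above -> 4096
assert page_level2 == block_size * num_kv_heads * head_size * (int8 + fp16)
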
@@ -118,7 +152,7 @@ class FullAttentionSpec(AttentionSpec):
         # (max_model_len//dcp_world_size) tokens locally.
         if dcp_world_size * pcp_world_size > 1:
             max_model_len = cdiv(max_model_len, dcp_world_size * pcp_world_size)
-        return cdiv(max_model_len, self.block_size) * self.page_size_bytes
+        return cdiv(max_model_len, self.block_size) * (self.page_size_bytes + self.scale_page_size_bytes)
 
     @classmethod
     def merge_window_sizes(cls, window_sizes: set[int]) -> int | None:
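
Continuing the assumed shapes above, a sketch of the per-request memory bound once the scale page is counted; cdiv is ceiling division.

max_model_len, block_size = 4096, 16                 # assumed example values
page_size_bytes, scale_page_size_bytes = 32768, 512  # opt level 1 numbers from the example above
num_blocks = -(-max_model_len // block_size)         # cdiv(4096, 16) = 256
max_bytes = num_blocks * (page_size_bytes + scale_page_size_bytes)   # 256 * 33280 = 8_519_680
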
@@ -179,12 +213,28 @@ class FullAttentionSpec(AttentionSpec):
 
     @property
     def real_page_size_bytes(self) -> int:
-        return (
-            self.block_size
-            * self.num_kv_heads
-            * (self.head_size + self.head_size_v)
-            * get_dtype_size(self.dtype)
-        )
+        if envs.VLLM_ATTN_OPT_LEVEL == 1:
+            return (
+                self.block_size
+                * self.num_kv_heads
+                * (self.head_size + self.head_size_v)
+                * get_dtype_size(torch.int8)
+            )
+        elif envs.VLLM_ATTN_OPT_LEVEL == 2:
+            return self.block_size * self.num_kv_heads * self.head_size \
+                * get_dtype_size(torch.int8) + self.block_size * self.num_kv_heads * self.head_size_v \
+                * get_dtype_size(self.dtype)
+        else:
+            return (
+                self.block_size
+                * self.num_kv_heads
+                * (self.head_size + self.head_size_v)
+                * get_dtype_size(self.dtype)
+            )
+    @property
+    def v_cache_scale_size_bytes(self) -> int:
+        return self.head_size_v * self.num_kv_heads * get_dtype_size(torch.float32)
+
 
 
 @dataclass(frozen=True, kw_only=True)
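
The same accounting for the head_size / head_size_v variant, again with assumed shapes (192 K dims, 128 V dims, 8 KV heads, fp16 cache) chosen only for illustration.

block_size, num_kv_heads = 16, 8
head_size, head_size_v = 192, 128
fp16, int8, fp32 = 2, 1, 4

level0 = block_size * num_kv_heads * (head_size + head_size_v) * fp16   # 81920
level1 = block_size * num_kv_heads * (head_size + head_size_v) * int8   # 40960
level2 = (block_size * num_kv_heads * head_size * int8                  # int8 K page: 24576
          + block_size * num_kv_heads * head_size_v * fp16)             # fp16 V page: 32768 -> 57344
v_scale = head_size_v * num_kv_heads * fp32                             # 4096
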
@@ -198,12 +248,30 @@ class MLAAttentionSpec(FullAttentionSpec):
             # See `vllm/v1/attention/backends/mla/flashmla_sparse.py`
             # for details.
             return self.block_size * 656
+        if envs.VLLM_USE_INT8_MLA:
+            return (
+                self.block_size
+                * self.num_kv_heads
+                * self.head_size
+                * get_dtype_size(torch.int8)
+            )
         return (
             self.block_size
             * self.num_kv_heads
             * self.head_size
             * get_dtype_size(self.dtype)
         )
+    @property
+    def scale_page_size_bytes(self) -> int:
+        # For MLA we only store a single latent vector
+        if envs.VLLM_USE_INT8_MLA:
+            return (
+                self.block_size
+                * self.num_kv_heads * 2
+                * get_dtype_size(torch.float32)
+            )
+        else:
+            return 0
 
     @classmethod
     def merge(cls, specs: list[Self]) -> Self:
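
A worked example for the MLA path; the shapes follow the usual MLA layout (a single KV head, 576 = 512 latent + 64 rope dims, 64-token blocks) and are assumptions for illustration, not values taken from this diff.

block_size, num_kv_heads, head_size = 64, 1, 576
int8, bf16, fp32 = 1, 2, 4

page_int8 = block_size * num_kv_heads * head_size * int8    # VLLM_USE_INT8_MLA path: 36864
page_bf16 = block_size * num_kv_heads * head_size * bf16    # default dtype path: 73728
scale_page = block_size * num_kv_heads * 2 * fp32            # factor of 2 as in the code above: 512
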
@@ -267,7 +335,7 @@ class SlidingWindowSpec(AttentionSpec):
         # of the block. For example, if the block size is 4 and num_token
         # is 4, we need two blocks [XXCD] [EF] to store the sliding
         # window [CDEF] of 4 tokens.
-        return (cdiv(num_tokens, self.block_size) + 1) * self.page_size_bytes
+        return (cdiv(num_tokens, self.block_size) + 1) * (self.page_size_bytes + self.scale_page_size_bytes)
 
 
 @dataclass(frozen=True)
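
A quick check of the "+1" block in the sliding-window bound, using the values from the comment above; with this change each of those blocks is also charged for its scale page.

block_size, num_tokens = 4, 4
num_blocks = -(-num_tokens // block_size) + 1   # cdiv(4, 4) + 1 = 2 blocks, e.g. [XXCD] [EF..]
# bytes = num_blocks * (page_size_bytes + scale_page_size_bytes)
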
@@ -289,7 +357,6 @@ class MambaSpec(KVCacheSpec):
             assert self.page_size_padded >= page_size
             return self.page_size_padded
         return page_size
 
+    @property
+    def scale_page_size_bytes(self) -> int:
+        return 0
@@ -389,6 +456,9 @@ class UniformTypeKVCacheSpecs(KVCacheSpec):
     @property
     def page_size_bytes(self) -> int:
         return sum(spec.page_size_bytes for spec in self.kv_cache_specs.values())
+    @property
+    def scale_page_size_bytes(self) -> int:
+        return sum(spec.scale_page_size_bytes for spec in self.kv_cache_specs.values())
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_num_pages = max(
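
For the uniform-type wrapper, the new property is just a sum over the per-layer specs; a toy sketch with stand-in layer names and byte values, not real spec objects.

per_layer_scale_page_bytes = {"model.layers.0.attn": 512, "model.layers.1.attn": 512, "model.layers.2.mamba": 0}
total_scale_page_bytes = sum(per_layer_scale_page_bytes.values())   # 1024
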
@@ -460,6 +530,7 @@ class KVCacheTensor:
 
     size: int  # size of the KV cache tensor in bytes
     shared_by: list[str]  # layer names that share the same KV cache tensor
+    size_scale: int = 0  # size of the v_cache_scale tensor in bytes, only used for VLLM_ATTN_OPT_LEVEL == 1
 
 
 @dataclass
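
A hedged sketch of how an allocator might fill the new field, reusing the opt level 1 numbers from the earlier example (32768-byte page, 512-byte scale page) and an assumed budget of 1024 blocks per tensor.

num_blocks = 1024                        # assumed block budget, not a vLLM default
kv_tensor_size = num_blocks * 32768      # would go into KVCacheTensor.size
kv_scale_size = num_blocks * 512         # would go into KVCacheTensor.size_scale (VLLM_ATTN_OPT_LEVEL == 1)
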
@@ -486,6 +557,7 @@ class KVCacheConfig:
     kv_cache_tensors: list[KVCacheTensor]
     """How should model runner initialize the KV cache tensors for each layer"""
     kv_cache_groups: list[KVCacheGroupSpec]
+    kv_cache_scale_tensors: list[KVCacheTensor]
     """
     The kv cache groups of the model.
     For models with only one type of attention, there is only one group that
@@ -493,3 +565,11 @@ class KVCacheConfig:
     For models with multiple types of attention, there will be multiple groups,
     see `_get_kv_cache_config_uniform_page_size` for more details.
     """
+
+    @property
+    def has_mamba_layers(self) -> bool:
+        return any(isinstance(g.kv_cache_spec, MambaSpec) for g in self.kv_cache_groups)
+
+    @property
+    def needs_kv_cache_zeroing(self) -> bool:
+        return self.has_mamba_layers