Upgrade to vllm 0.17.0 corex v4.1 overlay

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -12,6 +12,7 @@ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import get_dtype_size
import vllm.envs as envs
logger = init_logger(__name__)
@@ -34,6 +35,25 @@ class KVCacheSpec:
The page size
"""
raise NotImplementedError
@property
def scale_page_size_bytes(self) -> int:
"""
The size of a scale page with `block_size` tokens in bytes.
Returns:
The scale page size
"""
raise NotImplementedError
@property
def v_cache_scale_size_bytes(self) -> int:
"""
The size of the V-cache scale tensor in bytes.
Returns:
The V-cache scale tensor size
"""
raise NotImplementedError
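The two new abstract properties let memory accounting charge quantization scales per block alongside the data pages. A minimal sketch of how a concrete spec might implement them, assuming the one-float32-scale-per-token-per-head layout used elsewhere in this diff (the class name and default sizes are illustrative, not part of the change):

```python
import torch
from dataclasses import dataclass

from vllm.utils.torch_utils import get_dtype_size


@dataclass(frozen=True)
class ToyQuantizedSpec:  # hypothetical example, not part of the diff
    block_size: int = 16
    num_kv_heads: int = 8
    head_size: int = 128

    @property
    def scale_page_size_bytes(self) -> int:
        # one float32 scale per token per KV head in a block
        return self.block_size * self.num_kv_heads * get_dtype_size(torch.float32)

    @property
    def v_cache_scale_size_bytes(self) -> int:
        # one float32 scale per V-cache channel (head dim x KV heads)
        return self.head_size * self.num_kv_heads * get_dtype_size(torch.float32)
```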
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
"""
@@ -78,13 +98,27 @@ class AttentionSpec(KVCacheSpec):
@property
def real_page_size_bytes(self) -> int:
return (
2
* self.block_size
* self.num_kv_heads
* self.head_size
* get_dtype_size(self.dtype)
)
if envs.VLLM_ATTN_OPT_LEVEL == 1:
# MLA and i8q/i8k/i8v allocate the same amount of memory
return (
2 * self.block_size * self.num_kv_heads
* self.head_size * get_dtype_size(torch.int8)
)
elif envs.VLLM_ATTN_OPT_LEVEL == 2:
# i8q/i8k/f16v allocates f16 + int8 (V in f16, K in int8), hence the factor 3
return (
3 * self.block_size * self.num_kv_heads
* self.head_size * get_dtype_size(torch.int8)
)
return (
2 * self.block_size * self.num_kv_heads
* self.head_size * get_dtype_size(self.dtype)
)
@property
def scale_page_size_bytes(self) -> int:
# One float32 scale per token per KV head when the INT8 KV cache is enabled
if envs.VLLM_ATTN_OPT_LEVEL > 0:
return self.block_size * self.num_kv_heads * get_dtype_size(torch.float32)
else:
return 0
@property
def v_cache_scale_size_bytes(self) -> int:
return self.head_size * self.num_kv_heads * get_dtype_size(torch.float32)
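For a concrete sense of the three branches, assume block_size=16, num_kv_heads=8, head_size=128, and a float16 model dtype (numbers are illustrative, not from the diff):

```python
block_size, num_kv_heads, head_size = 16, 8, 128
elems = block_size * num_kv_heads * head_size  # 16384 elements per K or V page

level1 = 2 * elems * 1  # 32768 B: INT8 K + INT8 V, 1 byte each
level2 = 3 * elems * 1  # 49152 B: INT8 K (1 B) + FP16 V (2 B) = 3 B/elem
level0 = 2 * elems * 2  # 65536 B: K and V in the fp16 model dtype
```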
@dataclass(frozen=True, kw_only=True)
@@ -118,7 +152,7 @@ class FullAttentionSpec(AttentionSpec):
# (max_model_len//dcp_world_size) tokens locally.
if dcp_world_size * pcp_world_size > 1:
max_model_len = cdiv(max_model_len, dcp_world_size * pcp_world_size)
return cdiv(max_model_len, self.block_size) * self.page_size_bytes
return cdiv(max_model_len, self.block_size) * (
self.page_size_bytes + self.scale_page_size_bytes
)
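With scale pages now charged per block, the maximum usage grows slightly. A quick check with assumed sizes (the page sizes reuse the level-1 numbers from the example above):

```python
from vllm.utils.math_utils import cdiv

max_model_len, block_size = 4096, 16
page_size_bytes = 32768        # INT8 K+V page from the example above
scale_page_size_bytes = 512    # 16 tokens * 8 KV heads * 4 B (float32)

num_blocks = cdiv(max_model_len, block_size)                        # 256
max_bytes = num_blocks * (page_size_bytes + scale_page_size_bytes)
# 256 * 33280 = 8,519,680 bytes
```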
@classmethod
def merge_window_sizes(cls, window_sizes: set[int]) -> int | None:
@@ -179,12 +213,28 @@ class FullAttentionSpec(AttentionSpec):
@property
def real_page_size_bytes(self) -> int:
return (
self.block_size
* self.num_kv_heads
* (self.head_size + self.head_size_v)
* get_dtype_size(self.dtype)
)
if envs.VLLM_ATTN_OPT_LEVEL == 1:
return (
self.block_size
* self.num_kv_heads
* (self.head_size + self.head_size_v)
* get_dtype_size(torch.int8)
)
elif envs.VLLM_ATTN_OPT_LEVEL == 2:
# INT8 K cache plus V cache kept in the model dtype
return (
self.block_size * self.num_kv_heads * self.head_size
* get_dtype_size(torch.int8)
+ self.block_size * self.num_kv_heads * self.head_size_v
* get_dtype_size(self.dtype)
)
else:
return (
self.block_size
* self.num_kv_heads
* (self.head_size + self.head_size_v)
* get_dtype_size(self.dtype)
)
@property
def v_cache_scale_size_bytes(self) -> int:
return self.head_size_v * self.num_kv_heads * get_dtype_size(torch.float32)
@dataclass(frozen=True, kw_only=True)
@@ -198,12 +248,30 @@ class MLAAttentionSpec(FullAttentionSpec):
# See `vllm/v1/attention/backends/mla/flashmla_sparse.py`
# for details.
return self.block_size * 656
if envs.VLLM_USE_INT8_MLA:
return (
self.block_size
* self.num_kv_heads
* self.head_size
* get_dtype_size(torch.int8)
)
return (
self.block_size
* self.num_kv_heads
* self.head_size
* get_dtype_size(self.dtype)
)
@property
def scale_page_size_bytes(self) -> int:
# Two float32 scales per token for the single latent vector stored by MLA
if envs.VLLM_USE_INT8_MLA:
return (
self.block_size
* self.num_kv_heads * 2
* get_dtype_size(torch.float32)
)
else:
return 0
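Under VLLM_USE_INT8_MLA the latent page drops to one byte per element, while each token gains two float32 scales per KV head (the factor 2 above). A back-of-the-envelope with the common MLA layout of num_kv_heads=1 and head_size=576 at block_size=64 (assumed values, not from the diff):

```python
block_size, num_kv_heads, head_size = 64, 1, 576

int8_page = block_size * num_kv_heads * head_size * 1  # 36864 B per block
fp16_page = block_size * num_kv_heads * head_size * 2  # 73728 B per block
scale_page = block_size * num_kv_heads * 2 * 4         # 512 B: 2 fp32 scales/token
# INT8 MLA: roughly 50% page savings for under 1% scale overhead
```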
@classmethod
def merge(cls, specs: list[Self]) -> Self:
@@ -267,7 +335,7 @@ class SlidingWindowSpec(AttentionSpec):
# of the block. For example, if the block size is 4 and num_token
# is 4, we need two blocks [XXCD] [EF] to store the 4-token sliding
# window [CDEF] of the 6-token sequence [ABCDEF].
return (cdiv(num_tokens, self.block_size) + 1) * self.page_size_bytes
return (cdiv(num_tokens, self.block_size) + 1) * (
self.page_size_bytes + self.scale_page_size_bytes
)
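Continuing the comment's [XXCD] [EF] example with assumed page sizes, the extra block and the scale pages compose as follows:

```python
from vllm.utils.math_utils import cdiv

num_tokens, block_size = 6, 4                       # sliding window example above
page_size_bytes, scale_page_size_bytes = 8192, 128  # illustrative sizes

blocks = cdiv(num_tokens, block_size) + 1           # 3 blocks
total = blocks * (page_size_bytes + scale_page_size_bytes)  # 3 * 8320 = 24960 B
```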
@dataclass(frozen=True)
@@ -289,7 +357,6 @@ class MambaSpec(KVCacheSpec):
assert self.page_size_padded >= page_size
return self.page_size_padded
return page_size
@property
def scale_page_size_bytes(self) -> int:
return 0
@@ -389,6 +456,9 @@ class UniformTypeKVCacheSpecs(KVCacheSpec):
@property
def page_size_bytes(self) -> int:
return sum(spec.page_size_bytes for spec in self.kv_cache_specs.values())
@property
def scale_page_size_bytes(self) -> int:
return sum(spec.scale_page_size_bytes for spec in self.kv_cache_specs.values())
def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
max_num_pages = max(
@@ -460,6 +530,7 @@ class KVCacheTensor:
size: int # size of the KV cache tensor in bytes
shared_by: list[str] # layer names that share the same KV cache tensor
size_scale: int = 0 # size of the v_cache_scale tensor in bytes, only used for VLLM_ATTN_OPT_LEVEL == 1
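For illustration, a tensor shared by one attention layer might be declared like this (the sizes reuse the assumed numbers from the earlier examples and the layer name is a placeholder):

```python
tensor = KVCacheTensor(
    size=256 * 32768,        # 256 blocks of INT8 K+V data pages
    shared_by=["model.layers.0.self_attn.attn"],
    size_scale=256 * 512,    # matching float32 scale pages (opt level 1)
)
```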
@dataclass
@@ -486,6 +557,7 @@ class KVCacheConfig:
kv_cache_tensors: list[KVCacheTensor]
"""How should model runner initialize the KV cache tensors for each layer"""
kv_cache_scale_tensors: list[KVCacheTensor]
"""How should model runner initialize the KV cache scale tensors for each layer"""
kv_cache_groups: list[KVCacheGroupSpec]
"""
The kv cache groups of the model.
For models with only one type of attention, there is only one group that
@@ -493,3 +565,11 @@ class KVCacheConfig:
For models with multiple types of attention, there will be multiple groups,
see `_get_kv_cache_config_uniform_page_size` for more details.
"""
@property
def has_mamba_layers(self) -> bool:
return any(isinstance(g.kv_cache_spec, MambaSpec) for g in self.kv_cache_groups)
@property
def needs_kv_cache_zeroing(self) -> bool:
return self.has_mamba_layers
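Finally, a caller-side sketch of how the new properties and the size_scale field might compose during allocation (the allocate-and-zero helper is hypothetical, not from the diff):

```python
import torch


def init_kv_cache_buffers(config: KVCacheConfig) -> list[torch.Tensor]:
    """Hypothetical helper: allocate one flat buffer per KV cache tensor."""
    buffers = []
    for t in config.kv_cache_tensors:
        buf = torch.empty(t.size + t.size_scale, dtype=torch.uint8)
        if config.needs_kv_cache_zeroing:  # True iff any group is a MambaSpec
            buf.zero_()                    # Mamba state must start zeroed
        buffers.append(buf)                # shared by all layers in t.shared_by
    return buffers
```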