Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -12,6 +12,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import get_dtype_size
+import vllm.envs as envs
 
 logger = init_logger(__name__)
 
@@ -34,6 +35,25 @@ class KVCacheSpec:
             The page size
         """
         raise NotImplementedError
+    @property
+    def scale_page_size_bytes(self) -> int:
+        """
+        The size of a scale page with `block_size` tokens in bytes.
+
+        Returns:
+            The scale page size
+        """
+        raise NotImplementedError
+
+    @property
+    def v_cache_scale_size_bytes(self) -> int:
+        """
+        The size of the v_cache scale in bytes.
+
+        Returns:
+            The v_cache scale size
+        """
+        raise NotImplementedError
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         """
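
For orientation, a minimal sketch of how a concrete spec could satisfy the two new abstract properties; the shapes (16-token blocks, 8 KV heads, head size 128, fp32 scales) are assumed for illustration and are not taken from this change.

class _ToySpec:
    # Assumed example shapes, not vLLM defaults.
    block_size, num_kv_heads, head_size = 16, 8, 128

    @property
    def scale_page_size_bytes(self) -> int:
        return self.block_size * self.num_kv_heads * 4   # 512 bytes of fp32 scales per block

    @property
    def v_cache_scale_size_bytes(self) -> int:
        return self.head_size * self.num_kv_heads * 4    # 4096 bytes for the V-cache scales
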
@@ -78,13 +98,27 @@ class AttentionSpec(KVCacheSpec):
 
     @property
     def real_page_size_bytes(self) -> int:
-        return (
-            2
-            * self.block_size
-            * self.num_kv_heads
-            * self.head_size
-            * get_dtype_size(self.dtype)
-        )
+        if envs.VLLM_ATTN_OPT_LEVEL == 1:
+            # MLA and i8q-i8k-i8v allocate the same amount of memory.
+            return 2 * self.block_size * self.num_kv_heads * self.head_size \
+                * get_dtype_size(torch.int8)
+        elif envs.VLLM_ATTN_OPT_LEVEL == 2:
+            # i8q-i8k-f16v allocates f16 plus int8, hence the factor of 3.
+            return 3 * self.block_size * self.num_kv_heads * self.head_size \
+                * get_dtype_size(torch.int8)
+        return 2 * self.block_size * self.num_kv_heads * self.head_size \
+            * get_dtype_size(self.dtype)
+    @property
+    def scale_page_size_bytes(self) -> int:
+        # For MLA we only store a single latent vector
+        if envs.VLLM_ATTN_OPT_LEVEL > 0:
+            return self.block_size * self.num_kv_heads * get_dtype_size(torch.float32)
+        else:
+            return 0
+
+    @property
+    def v_cache_scale_size_bytes(self) -> int:
+        return self.head_size * self.num_kv_heads * get_dtype_size(torch.float32)
 
 
 @dataclass(frozen=True, kw_only=True)
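
To make the byte accounting concrete, a worked example under each VLLM_ATTN_OPT_LEVEL setting; the shapes (fp16 KV cache, 16-token blocks, 8 KV heads, head size 128) are assumptions for illustration, not values from any model config.

block_size, num_kv_heads, head_size = 16, 8, 128
fp16, int8, fp32 = 2, 1, 4  # bytes per element

page_default = 2 * block_size * num_kv_heads * head_size * fp16   # level 0: fp16 K and V -> 65536
page_level1 = 2 * block_size * num_kv_heads * head_size * int8    # level 1: int8 K and V -> 32768
page_level2 = 3 * block_size * num_kv_heads * head_size * int8    # level 2: int8 K + fp16 V -> 49152
scale_page = block_size * num_kv_heads * fp32                     # scale page when the level is non-zero -> 512
v_scale = head_size * num_kv_heads * fp32                         # matches v_cache_scale_size_bytes above -> 4096
assert page_level2 == block_size * num_kv_heads * head_size * (int8 + fp16)
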
@@ -118,7 +152,7 @@ class FullAttentionSpec(AttentionSpec):
         # (max_model_len//dcp_world_size) tokens locally.
         if dcp_world_size * pcp_world_size > 1:
             max_model_len = cdiv(max_model_len, dcp_world_size * pcp_world_size)
-        return cdiv(max_model_len, self.block_size) * self.page_size_bytes
+        return cdiv(max_model_len, self.block_size) * (self.page_size_bytes + self.scale_page_size_bytes)
 
     @classmethod
     def merge_window_sizes(cls, window_sizes: set[int]) -> int | None:
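
Continuing the assumed shapes above, a sketch of the per-request memory bound once the scale page is counted; cdiv is ceiling division.

max_model_len, block_size = 4096, 16                 # assumed example values
page_size_bytes, scale_page_size_bytes = 32768, 512  # opt level 1 numbers from the example above
num_blocks = -(-max_model_len // block_size)         # cdiv(4096, 16) = 256
max_bytes = num_blocks * (page_size_bytes + scale_page_size_bytes)   # 256 * 33280 = 8_519_680
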
@@ -179,12 +213,28 @@ class FullAttentionSpec(AttentionSpec):
 
     @property
     def real_page_size_bytes(self) -> int:
-        return (
-            self.block_size
-            * self.num_kv_heads
-            * (self.head_size + self.head_size_v)
-            * get_dtype_size(self.dtype)
-        )
+        if envs.VLLM_ATTN_OPT_LEVEL == 1:
+            return (
+                self.block_size
+                * self.num_kv_heads
+                * (self.head_size + self.head_size_v)
+                * get_dtype_size(torch.int8)
+            )
+        elif envs.VLLM_ATTN_OPT_LEVEL == 2:
+            return self.block_size * self.num_kv_heads * self.head_size \
+                * get_dtype_size(torch.int8) + self.block_size * self.num_kv_heads * self.head_size_v \
+                * get_dtype_size(self.dtype)
+        else:
+            return (
+                self.block_size
+                * self.num_kv_heads
+                * (self.head_size + self.head_size_v)
+                * get_dtype_size(self.dtype)
+            )
+    @property
+    def v_cache_scale_size_bytes(self) -> int:
+        return self.head_size_v * self.num_kv_heads * get_dtype_size(torch.float32)
+
 
 
 @dataclass(frozen=True, kw_only=True)
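
The same accounting for the head_size / head_size_v variant, again with assumed shapes (192 K dims, 128 V dims, 8 KV heads, fp16 cache) chosen only for illustration.

block_size, num_kv_heads = 16, 8
head_size, head_size_v = 192, 128
fp16, int8, fp32 = 2, 1, 4

level0 = block_size * num_kv_heads * (head_size + head_size_v) * fp16   # 81920
level1 = block_size * num_kv_heads * (head_size + head_size_v) * int8   # 40960
level2 = (block_size * num_kv_heads * head_size * int8                  # int8 K page: 24576
          + block_size * num_kv_heads * head_size_v * fp16)             # fp16 V page: 32768 -> 57344
v_scale = head_size_v * num_kv_heads * fp32                             # 4096
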
@@ -198,12 +248,30 @@ class MLAAttentionSpec(FullAttentionSpec):
             # See `vllm/v1/attention/backends/mla/flashmla_sparse.py`
             # for details.
             return self.block_size * 656
+        if envs.VLLM_USE_INT8_MLA:
+            return (
+                self.block_size
+                * self.num_kv_heads
+                * self.head_size
+                * get_dtype_size(torch.int8)
+            )
         return (
             self.block_size
             * self.num_kv_heads
             * self.head_size
             * get_dtype_size(self.dtype)
         )
+    @property
+    def scale_page_size_bytes(self) -> int:
+        # For MLA we only store a single latent vector
+        if envs.VLLM_USE_INT8_MLA:
+            return (
+                self.block_size
+                * self.num_kv_heads * 2
+                * get_dtype_size(torch.float32)
+            )
+        else:
+            return 0
 
     @classmethod
     def merge(cls, specs: list[Self]) -> Self:
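
A worked example for the MLA path; the shapes follow the usual MLA layout (a single KV head, 576 = 512 latent + 64 rope dims, 64-token blocks) and are assumptions for illustration, not values taken from this diff.

block_size, num_kv_heads, head_size = 64, 1, 576
int8, bf16, fp32 = 1, 2, 4

page_int8 = block_size * num_kv_heads * head_size * int8    # VLLM_USE_INT8_MLA path: 36864
page_bf16 = block_size * num_kv_heads * head_size * bf16    # default dtype path: 73728
scale_page = block_size * num_kv_heads * 2 * fp32            # factor of 2 as in the code above: 512
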
@@ -267,7 +335,7 @@ class SlidingWindowSpec(AttentionSpec):
         # of the block. For example, if the block size is 4 and num_token
         # is 4, we need two blocks [XXCD] [EF] to store the sliding
         # window [CDEF] of 4 tokens.
-        return (cdiv(num_tokens, self.block_size) + 1) * self.page_size_bytes
+        return (cdiv(num_tokens, self.block_size) + 1) * (self.page_size_bytes + self.scale_page_size_bytes)
 
 
 @dataclass(frozen=True)
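
A quick check of the "+1" block in the sliding-window bound, using the values from the comment above; with this change each of those blocks is also charged for its scale page.

block_size, num_tokens = 4, 4
num_blocks = -(-num_tokens // block_size) + 1   # cdiv(4, 4) + 1 = 2 blocks, e.g. [XXCD] [EF..]
# bytes = num_blocks * (page_size_bytes + scale_page_size_bytes)
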
@@ -289,7 +357,6 @@ class MambaSpec(KVCacheSpec):
             assert self.page_size_padded >= page_size
             return self.page_size_padded
         return page_size
 
+    @property
+    def scale_page_size_bytes(self) -> int:
+        return 0
@@ -389,6 +456,9 @@ class UniformTypeKVCacheSpecs(KVCacheSpec):
     @property
     def page_size_bytes(self) -> int:
         return sum(spec.page_size_bytes for spec in self.kv_cache_specs.values())
+    @property
+    def scale_page_size_bytes(self) -> int:
+        return sum(spec.scale_page_size_bytes for spec in self.kv_cache_specs.values())
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_num_pages = max(
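
For the uniform-type wrapper, the new property is just a sum over the per-layer specs; a toy sketch with stand-in layer names and byte values, not real spec objects.

per_layer_scale_page_bytes = {"model.layers.0.attn": 512, "model.layers.1.attn": 512, "model.layers.2.mamba": 0}
total_scale_page_bytes = sum(per_layer_scale_page_bytes.values())   # 1024
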
@@ -460,6 +530,7 @@ class KVCacheTensor:
 
     size: int  # size of the KV cache tensor in bytes
     shared_by: list[str]  # layer names that share the same KV cache tensor
+    size_scale: int = 0  # size of the v_cache_scale tensor in bytes, only used for VLLM_ATTN_OPT_LEVEL == 1
 
 
 @dataclass
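
A hedged sketch of how an allocator might fill the new field, reusing the opt level 1 numbers from the earlier example (32768-byte page, 512-byte scale page) and an assumed budget of 1024 blocks per tensor.

num_blocks = 1024                        # assumed block budget, not a vLLM default
kv_tensor_size = num_blocks * 32768      # would go into KVCacheTensor.size
kv_scale_size = num_blocks * 512         # would go into KVCacheTensor.size_scale (VLLM_ATTN_OPT_LEVEL == 1)
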
@@ -486,6 +557,7 @@ class KVCacheConfig:
     kv_cache_tensors: list[KVCacheTensor]
     """How should model runner initialize the KV cache tensors for each layer"""
     kv_cache_groups: list[KVCacheGroupSpec]
+    kv_cache_scale_tensors: list[KVCacheTensor]
     """
     The kv cache groups of the model.
     For models with only one type of attention, there is only one group that
@@ -493,3 +565,11 @@ class KVCacheConfig:
     For models with multiple types of attention, there will be multiple groups,
     see `_get_kv_cache_config_uniform_page_size` for more details.
     """
+
+    @property
+    def has_mamba_layers(self) -> bool:
+        return any(isinstance(g.kv_cache_spec, MambaSpec) for g in self.kv_cache_groups)
+
+    @property
+    def needs_kv_cache_zeroing(self) -> bool:
+        return self.has_mamba_layers