Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -12,7 +12,7 @@ from vllm.config.vllm import VllmConfig
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.attention.kv_transfer_utils import (
    maybe_transfer_kv_layer,
    maybe_transfer_kv_layer
)
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
@@ -40,6 +40,9 @@ from vllm.v1.kv_cache_interface import (
    KVCacheSpec,
    SlidingWindowSpec,
)
from .extra_cache import StaticQuantManager
from ixformer.core import config
_USE_TORCH_OPS = config.IXFORMER_USE_TORCH_OPS

if TYPE_CHECKING:
    from vllm.model_executor.layers.attention import MLAAttention
@@ -202,6 +205,7 @@ class Attention(nn.Module, AttentionLayerBase):
        kv_sharing_target_layer_name: str | None = None,
        attn_backend: type[AttentionBackend] | None = None,
        head_size_v: int | None = None,
        extra_cache_para: dict = None,
        **extra_impl_args,
    ) -> None:
        """
@@ -258,6 +262,7 @@ class Attention(nn.Module, AttentionLayerBase):

        self.num_heads = num_heads
        self.head_size = head_size
        self.hidden_size = head_size * num_heads
        self.head_size_v = self.head_size if head_size_v is None else head_size_v
        self.num_kv_heads = num_kv_heads
        self.sliding_window = sliding_window
@@ -326,6 +331,15 @@ class Attention(nn.Module, AttentionLayerBase):
            kv_sharing_target_layer_name,
            **extra_impl_args,
        )
        if extra_cache_para is not None:
            self.quant_manager = StaticQuantManager(
                layer_id=extra_cache_para.get("layer_id", None),
                shape=(self.num_kv_heads, self.head_size_v),
                dtype=torch.float32,
                total_layer_num=extra_cache_para.get("total_layer_num", None)
            )
        else:
            self.quant_manager = None
        self.backend = AttentionBackendEnum[self.attn_backend.get_name()]
        self.dtype = dtype

@@ -333,7 +347,10 @@ class Attention(nn.Module, AttentionLayerBase):
        # torch.compile works by registering the attention as one giant
        # opaque custom op. For other platforms, we directly call them
        # and let torch.compile handle them.
        self.use_direct_call = not current_platform.opaque_attention_op()
        if _USE_TORCH_OPS:
            self.use_direct_call = False
        else:
            self.use_direct_call = True

        self.use_output = self.attn_backend.accept_output_buffer
        compilation_config = vllm_config.compilation_config
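Note: a minimal sketch of what the use_direct_call toggle selects between, assuming only the two code paths visible later in this diff (the direct impl.forward call and the torch.ops.vllm custom op); run_attention and its argument handling are illustrative, not code from the file.

import torch

def run_attention(layer, query, key, value, kv_cache, attn_metadata, output):
    # Illustrative dispatch only. The direct path calls the backend impl as
    # ordinary Python, so torch.compile traces through it; the op path goes
    # through the registered opaque custom op, which compiles as a single node.
    if layer.use_direct_call:
        return layer.impl.forward(
            layer, query, key, value, kv_cache, attn_metadata, output=output
        )
    torch.ops.vllm.unified_attention_with_output(
        query, key, value, output, layer.layer_name
    )
    return output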
@@ -349,14 +366,26 @@ class Attention(nn.Module, AttentionLayerBase):
                compilation_config.static_forward_context,
            )
        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name

        # use a placeholder kv cache tensor during init, which will be replaced
        # by bind_kv_cache
        # this variable will not be accessed if use_direct_call is True
        self.kv_cache = [
            torch.tensor([])
            for _ in range(vllm_config.parallel_config.pipeline_parallel_size)
        ]
        self.is_i8qi8ki8v = envs.VLLM_ATTN_OPT_LEVEL == 1
        self.is_i8qi8kf16v = envs.VLLM_ATTN_OPT_LEVEL == 2
        if self.is_i8qi8kf16v:
            self.kv_cache_scale = [
                torch.tensor([]) for _ in range(get_current_vllm_config(
                ).parallel_config.pipeline_parallel_size)
            ]
        elif self.is_i8qi8ki8v:
            self.kv_cache_scale = [
                [torch.tensor([]), torch.tensor([])] for _ in range(get_current_vllm_config(
                ).parallel_config.pipeline_parallel_size)
            ]

        # use a placeholder kv cache tensor during init, which will be replaced
        # by bind_kv_cache
        # this variable will not be accessed if use_direct_call is True

        # Initialize KV cache quantization attributes
        _init_kv_cache_quant(self, quant_config, prefix)
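Note: a small, self-contained sketch of the placeholder pattern above, assuming (as the comments state) that the empty tensors are later swapped for real per-virtual-engine cache tensors by bind_kv_cache; bind_caches and the tensor sizes used here are illustrative stand-ins, not the vLLM API.

import torch

pipeline_parallel_size = 2  # illustrative value

# One empty placeholder per pipeline-parallel virtual engine, as in the diff.
kv_cache = [torch.tensor([]) for _ in range(pipeline_parallel_size)]
# Opt level 1 additionally keeps a [k_scale, v_scale] pair per virtual engine.
kv_cache_scale = [[torch.tensor([]), torch.tensor([])] for _ in range(pipeline_parallel_size)]

def bind_caches(placeholders, real_tensors):
    # Hypothetical stand-in for the later binding step: overwrite each list
    # slot so anything holding the list now sees the real cache tensor.
    for i, real in enumerate(real_tensors):
        placeholders[i] = real

bind_caches(kv_cache, [torch.zeros(16, 8), torch.zeros(16, 8)])
virtual_engine = 0
print(kv_cache[virtual_engine].shape)  # torch.Size([16, 8])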
@@ -396,6 +425,7 @@ class Attention(nn.Module, AttentionLayerBase):
        context using
        `vllm.forward_context.get_forward_context().attn_metadata`.
        """
        optional_args = {}
        if self.calculate_kv_scales:
            torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name)
        output_dtype = query.dtype
@@ -412,15 +442,8 @@ class Attention(nn.Module, AttentionLayerBase):
            query, _ = self.query_quant(query, self._q_scale)

        if self.use_output:
            if output_shape is None:
                # Handle both 2D [num_tokens, hidden] and
                # 3D [num_tokens, heads, head_dim] query
                num_tokens = query.shape[0]
                output_shape = torch.Size(
                    (num_tokens, self.num_heads * self.head_size_v)
                )
            output_shape = output_shape if output_shape is not None else query.shape
            output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
            hidden_size = output_shape[-1]
            # Reshape the query, key, and value tensors.
            # NOTE(woosuk): We do this outside the custom op to minimize the
            # CPU overheads from the non-CUDA-graph regions.
@@ -430,46 +453,50 @@ class Attention(nn.Module, AttentionLayerBase):
                key = key.view(-1, self.num_kv_heads, self.head_size)
            if value is not None:
                value = value.view(-1, self.num_kv_heads, self.head_size_v)
            kv_cache_dummy_dep = None
            if self.use_direct_call:
                # Skip this if sharing KV cache with an earlier attention layer.
                if (
                    not self.attn_backend.forward_includes_kv_cache_update
                    and self.kv_sharing_target_layer_name is None
                    and key is not None
                    and value is not None
                ):
                    kv_cache_dummy_dep = unified_kv_cache_update(
                        key, value, self.layer_name
                def direct_forward(layer_name: str, output: torch.Tensor):
                    forward_context: ForwardContext = get_forward_context()
                    attn_metadata = forward_context.attn_metadata
                    if isinstance(attn_metadata, dict):
                        attn_metadata = attn_metadata[layer_name]
                    self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                    # Skip this if sharing KV cache with an earlier attention layer.
                    if self.is_i8qi8ki8v or self.is_i8qi8kf16v:
                        optional_args["kv_cache_scale"] = self.kv_cache_scale[forward_context.virtual_engine]
                    output = self.impl.forward(
                        self,
                        query,
                        key,
                        value,
                        self_kv_cache,
                        attn_metadata,
                        output=output,
                        **optional_args
                    )
                unified_attention_with_output(
                    query,
                    key,
                    value,
                    output,
                    self.layer_name,
                    kv_cache_dummy_dep=kv_cache_dummy_dep,
                )
                    return output
                return maybe_transfer_kv_layer(direct_forward)(self.layer_name, output)
            else:
                # Skip this if sharing KV cache with an earlier attention layer.
                if (
                    not self.attn_backend.forward_includes_kv_cache_update
                    and self.kv_sharing_target_layer_name is None
                    and key is not None
                    and value is not None
                ):
                    kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
                        key, value, self.layer_name
                    )
                if self.is_i8qi8ki8v:
                    forward_context: ForwardContext = get_forward_context()
                    kv_cache_scale = self.kv_cache_scale[forward_context.virtual_engine][0]
                    v_cache_scale = self.kv_cache_scale[forward_context.virtual_engine][1]
                elif self.is_i8qi8kf16v:
                    forward_context: ForwardContext = get_forward_context()
                    kv_cache_scale = self.kv_cache_scale[forward_context.virtual_engine]
                    v_cache_scale = None
                else:
                    kv_cache_scale = None
                    v_cache_scale = None
                torch.ops.vllm.unified_attention_with_output(
                    query,
                    key,
                    value,
                    output,
                    self.layer_name,
                    kv_cache_dummy_dep=kv_cache_dummy_dep,
                    kv_cache_scale,
                    v_cache_scale
                )
            return output.view(-1, hidden_size)
            return output.view(-1, self.hidden_size)
        else:
            assert self.attn_backend.forward_includes_kv_cache_update, (
                "Split KV cache update not supported when output tensor not provided."
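Note: a compact restatement of the scale-selection branch above as a standalone helper, under the assumption that opt level 1 (is_i8qi8ki8v) keeps separate K and V cache scales while level 2 (is_i8qi8kf16v) quantizes only the K cache and therefore carries a single scale; select_cache_scales is an illustrative name, not a function in the file.

import torch

def select_cache_scales(scale_slot, is_i8qi8ki8v, is_i8qi8kf16v):
    # Level 1: the slot holds a [k_scale, v_scale] pair for this virtual engine.
    if is_i8qi8ki8v:
        return scale_slot[0], scale_slot[1]
    # Level 2: the slot is a single tensor; the V cache stays fp16, so no V scale.
    if is_i8qi8kf16v:
        return scale_slot, None
    # Other levels: no KV cache scales at all.
    return None, None

k_scale, v_scale = select_cache_scales([torch.tensor([0.1]), torch.tensor([0.2])], True, False)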
@@ -521,6 +548,7 @@ class Attention(nn.Module, AttentionLayerBase):
        block_size = vllm_config.cache_config.block_size
        # Should not be called for enc-dec or encoder-only attention.
        assert self.attn_type == AttentionType.DECODER
        # TODO: the kernel does not support a KV cache for sliding_window; use FullAttentionSpec instead.
        if self.sliding_window is not None:
            assert not vllm_config.model_config.use_mla, (
                "MLA is not supported with sliding window"
@@ -689,6 +717,8 @@ def unified_attention_with_output(
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    kv_cache_scale: torch.Tensor | None = None,
    v_cache_scale: torch.Tensor | None = None,
    output_scale: torch.Tensor | None = None,
    output_block_scale: torch.Tensor | None = None,
    kv_cache_dummy_dep: torch.Tensor | None = None,
@@ -696,9 +726,7 @@ def unified_attention_with_output(
    # kv_cache_dummy_dep is not used but accepting it creates a data dependency
    # that ensures torch.compile preserves ordering between KV cache update and
    # attention forward.
    del kv_cache_dummy_dep
    attn_metadata, self, kv_cache, _ = get_attention_context(layer_name)

    self.impl.forward(
        self,
        query,
@@ -707,6 +735,7 @@ def unified_attention_with_output(
        kv_cache,
        attn_metadata,
        output=output,
        kv_cache_scale=[kv_cache_scale, v_cache_scale] if envs.VLLM_ATTN_OPT_LEVEL == 1 else kv_cache_scale,
        output_scale=output_scale,
        output_block_scale=output_block_scale,
    )
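Note: the kv_cache_dummy_dep comment above describes an ordering trick; the sketch below is a generic, framework-agnostic illustration of it, not vLLM's registration code. The cache-update step returns a small tensor that the attention step accepts (and discards), so a compiler that follows data dependencies cannot move the update after the attention.

import torch

def cache_update(key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
    # ... write key/value into the cache here (omitted) ...
    # Return a tiny tensor whose only job is to act as a dependency token.
    return torch.empty(0, device=key.device)

def attention(query: torch.Tensor, dep: torch.Tensor | None = None) -> torch.Tensor:
    # The token is unused; accepting it creates the graph edge that pins the
    # cache update before this op.
    del dep
    return query  # placeholder for the real attention math

q = torch.randn(2, 4)
token = cache_update(torch.randn(2, 4), torch.randn(2, 4))
out = attention(q, dep=token)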
@@ -718,6 +747,8 @@ def unified_attention_with_output_fake(
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    kv_cache_scale: torch.Tensor | None = None,
    v_cache_scale: torch.Tensor | None = None,
    output_scale: torch.Tensor | None = None,
    output_block_scale: torch.Tensor | None = None,
    kv_cache_dummy_dep: torch.Tensor | None = None,
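Note: the fake variant's parameter list is extended here to stay in sync with the real op. As a hedged illustration of why the two must match, the generic torch.library pattern below registers a real custom op and a fake (meta) implementation with the same signature, which is what lets torch.compile trace shapes without running the kernel; the mylib::attn_stub op is purely illustrative and not vLLM's own registration helper.

import torch
from torch.library import custom_op, register_fake

@custom_op("mylib::attn_stub", mutates_args=("output",))
def attn_stub(query: torch.Tensor, output: torch.Tensor, scale: float) -> None:
    # Real implementation: write the scaled result into the output buffer.
    output.copy_(query * scale)

@register_fake("mylib::attn_stub")
def attn_stub_fake(query: torch.Tensor, output: torch.Tensor, scale: float) -> None:
    # Fake/meta implementation: identical signature, no computation; it only
    # describes the op's effect on shapes and dtypes for the compiler.
    return None

q = torch.randn(2, 3)
out = torch.empty_like(q)
torch.ops.mylib.attn_stub(q, out, 2.0)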