Upgrade to vLLM 0.17.0 (corex v4.1 overlay)

2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions


@@ -12,7 +12,7 @@ from vllm.config.vllm import VllmConfig
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.attention.kv_transfer_utils import (
maybe_transfer_kv_layer,
maybe_transfer_kv_layer
)
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
@@ -40,6 +40,9 @@ from vllm.v1.kv_cache_interface import (
KVCacheSpec,
SlidingWindowSpec,
)
from .extra_cache import StaticQuantManager
from ixformer.core import config
_USE_TORCH_OPS = config.IXFORMER_USE_TORCH_OPS
if TYPE_CHECKING:
from vllm.model_executor.layers.attention import MLAAttention
@@ -202,6 +205,7 @@ class Attention(nn.Module, AttentionLayerBase):
kv_sharing_target_layer_name: str | None = None,
attn_backend: type[AttentionBackend] | None = None,
head_size_v: int | None = None,
extra_cache_para: dict | None = None,
**extra_impl_args,
) -> None:
"""
@@ -258,6 +262,7 @@ class Attention(nn.Module, AttentionLayerBase):
self.num_heads = num_heads
self.head_size = head_size
self.hidden_size = head_size * num_heads
self.head_size_v = self.head_size if head_size_v is None else head_size_v
self.num_kv_heads = num_kv_heads
self.sliding_window = sliding_window
@@ -326,6 +331,15 @@ class Attention(nn.Module, AttentionLayerBase):
kv_sharing_target_layer_name,
**extra_impl_args,
)
if extra_cache_para is not None:
self.quant_manager = StaticQuantManager(
layer_id=extra_cache_para.get("layer_id", None),
shape=(self.num_kv_heads, self.head_size_v),
dtype=torch.float32,
total_layer_num=extra_cache_para.get("total_layer_num", None)
)
else:
self.quant_manager = None
self.backend = AttentionBackendEnum[self.attn_backend.get_name()]
self.dtype = dtype
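Side note on the new `extra_cache_para` hook: `__init__` only reads the `layer_id` and `total_layer_num` keys and uses them to build a per-layer `StaticQuantManager` over a `(num_kv_heads, head_size_v)` float32 scale shape. A minimal caller-side sketch, assuming a model wires it in per decoder layer (everything except `extra_cache_para` itself is illustrative, and `StaticQuantManager`'s behaviour beyond its constructor is not visible in this diff):

```python
# Hypothetical caller-side sketch only: shows the two keys __init__ actually reads.
self.attn = Attention(
    num_heads=num_heads,
    head_size=head_size,
    scale=self.scaling,
    num_kv_heads=num_kv_heads,
    extra_cache_para={
        "layer_id": layer_idx,          # index of this attention layer
        "total_layer_num": num_layers,  # total number of attention layers
    },
    prefix=f"{prefix}.attn",
)
```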
@@ -333,7 +347,10 @@ class Attention(nn.Module, AttentionLayerBase):
# torch.compile works by registering the attention as one giant
# opaque custom op. For other platforms, we directly call them
# and let torch.compile handle them.
self.use_direct_call = not current_platform.opaque_attention_op()
if _USE_TORCH_OPS:
self.use_direct_call = False
else:
self.use_direct_call = True
self.use_output = self.attn_backend.accept_output_buffer
compilation_config = vllm_config.compilation_config
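With this change the dispatch decision no longer consults `current_platform.opaque_attention_op()`; it is driven entirely by the ixformer flag imported at the top of the file. The if/else above is equivalent to this condensed form (a sketch, not part of the diff):

```python
# When IXFORMER_USE_TORCH_OPS is set, attention is routed through the
# registered torch.ops custom ops (unified_attention_with_output et al.);
# otherwise forward() calls self.impl.forward directly from Python.
self.use_direct_call = not _USE_TORCH_OPS
```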
@@ -349,14 +366,26 @@ class Attention(nn.Module, AttentionLayerBase):
compilation_config.static_forward_context,
)
self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
# use a placeholder kv cache tensor during init, which will be replaced
# by bind_kv_cache
# this variable will not be accessed if use_direct_call is True
self.kv_cache = [
torch.tensor([])
for _ in range(vllm_config.parallel_config.pipeline_parallel_size)
]
self.is_i8qi8ki8v = envs.VLLM_ATTN_OPT_LEVEL == 1
self.is_i8qi8kf16v = envs.VLLM_ATTN_OPT_LEVEL == 2
if self.is_i8qi8kf16v:
self.kv_cache_scale = [
torch.tensor([]) for _ in range(get_current_vllm_config(
).parallel_config.pipeline_parallel_size)
]
elif self.is_i8qi8ki8v:
self.kv_cache_scale = [
[torch.tensor([]), torch.tensor([])] for _ in range(get_current_vllm_config(
).parallel_config.pipeline_parallel_size)
]
# use a placeholder kv cache tensor during init, which will be replaced
# by bind_kv_cache
# this variable will not be accessed if use_direct_call is True
# Initialize KV cache quantization attributes
_init_kv_cache_quant(self, quant_config, prefix)
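The scale placeholders mirror the `self.kv_cache` placeholder list: one entry per pipeline-parallel virtual engine, with the real tensors expected to be bound later. A small sketch of how `forward()` consumes this layout further down in the diff (the helper name is illustrative):

```python
# Documentation sketch, matching the placeholder layout set up above:
#   VLLM_ATTN_OPT_LEVEL == 1 (i8q/i8k/i8v):  kv_cache_scale[ve] == [k_scale, v_scale]
#   VLLM_ATTN_OPT_LEVEL == 2 (i8q/i8k/f16v): kv_cache_scale[ve] == k_scale
def _get_kv_cache_scales(layer: "Attention", virtual_engine: int):
    if layer.is_i8qi8ki8v:
        k_scale, v_scale = layer.kv_cache_scale[virtual_engine]
    elif layer.is_i8qi8kf16v:
        k_scale, v_scale = layer.kv_cache_scale[virtual_engine], None
    else:
        k_scale, v_scale = None, None
    return k_scale, v_scale
```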
@@ -396,6 +425,7 @@ class Attention(nn.Module, AttentionLayerBase):
context using
`vllm.forward_context.get_forward_context().attn_metadata`.
"""
optional_args = {}
if self.calculate_kv_scales:
torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name)
output_dtype = query.dtype
@@ -412,15 +442,8 @@ class Attention(nn.Module, AttentionLayerBase):
query, _ = self.query_quant(query, self._q_scale)
if self.use_output:
if output_shape is None:
# Handle both 2D [num_tokens, hidden] and
# 3D [num_tokens, heads, head_dim] query
num_tokens = query.shape[0]
output_shape = torch.Size(
(num_tokens, self.num_heads * self.head_size_v)
)
output_shape = output_shape if output_shape is not None else query.shape
output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
hidden_size = output_shape[-1]
# Reshape the query, key, and value tensors.
# NOTE(woosuk): We do this outside the custom op to minimize the
# CPU overheads from the non-CUDA-graph regions.
@@ -430,46 +453,50 @@ class Attention(nn.Module, AttentionLayerBase):
key = key.view(-1, self.num_kv_heads, self.head_size)
if value is not None:
value = value.view(-1, self.num_kv_heads, self.head_size_v)
kv_cache_dummy_dep = None
if self.use_direct_call:
# Skip this if sharing KV cache with an earlier attention layer.
if (
not self.attn_backend.forward_includes_kv_cache_update
and self.kv_sharing_target_layer_name is None
and key is not None
and value is not None
):
kv_cache_dummy_dep = unified_kv_cache_update(
key, value, self.layer_name
def direct_forward(layer_name: str, output: torch.Tensor):
forward_context: ForwardContext = get_forward_context()
attn_metadata = forward_context.attn_metadata
if isinstance(attn_metadata, dict):
attn_metadata = attn_metadata[layer_name]
self_kv_cache = self.kv_cache[forward_context.virtual_engine]
# Skip this if sharing KV cache with an earlier attention layer.
if self.is_i8qi8ki8v or self.is_i8qi8kf16v:
optional_args["kv_cache_scale"] = self.kv_cache_scale[forward_context.virtual_engine]
output = self.impl.forward(
self,
query,
key,
value,
self_kv_cache,
attn_metadata,
output=output,
**optional_args
)
unified_attention_with_output(
query,
key,
value,
output,
self.layer_name,
kv_cache_dummy_dep=kv_cache_dummy_dep,
)
return output
return maybe_transfer_kv_layer(direct_forward)(self.layer_name, output)
else:
# Skip this if sharing KV cache with an earlier attention layer.
if (
not self.attn_backend.forward_includes_kv_cache_update
and self.kv_sharing_target_layer_name is None
and key is not None
and value is not None
):
kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
key, value, self.layer_name
)
if self.is_i8qi8ki8v:
forward_context: ForwardContext = get_forward_context()
kv_cache_scale = self.kv_cache_scale[forward_context.virtual_engine][0]
v_cache_scale = self.kv_cache_scale[forward_context.virtual_engine][1]
elif self.is_i8qi8kf16v:
forward_context: ForwardContext = get_forward_context()
kv_cache_scale = self.kv_cache_scale[forward_context.virtual_engine]
v_cache_scale = None
else:
kv_cache_scale = None
v_cache_scale = None
torch.ops.vllm.unified_attention_with_output(
query,
key,
value,
output,
self.layer_name,
kv_cache_dummy_dep=kv_cache_dummy_dep,
kv_cache_scale=kv_cache_scale,
v_cache_scale=v_cache_scale,
)
return output.view(-1, hidden_size)
return output.view(-1, self.hidden_size)
else:
assert self.attn_backend.forward_includes_kv_cache_update, (
"Split KV cache update not supported when output tensor not provided."
@@ -521,6 +548,7 @@ class Attention(nn.Module, AttentionLayerBase):
block_size = vllm_config.cache_config.block_size
# Should not be called for enc-dec or encoder-only attention.
assert self.attn_type == AttentionType.DECODER
# TODO: the kernel does not support a KV cache for sliding_window; fall back to FullAttentionSpec instead
if self.sliding_window is not None:
assert not vllm_config.model_config.use_mla, (
"MLA is not supported for slidingwindow"
@@ -689,6 +717,8 @@ def unified_attention_with_output(
value: torch.Tensor,
output: torch.Tensor,
layer_name: str,
kv_cache_scale: torch.Tensor | None = None,
v_cache_scale: torch.Tensor | None = None,
output_scale: torch.Tensor | None = None,
output_block_scale: torch.Tensor | None = None,
kv_cache_dummy_dep: torch.Tensor | None = None,
@@ -696,9 +726,7 @@ def unified_attention_with_output(
# kv_cache_dummy_dep is not used but accepting it creates a data dependency
# that ensures torch.compile preserves ordering between KV cache update and
# attention forward.
del kv_cache_dummy_dep
attn_metadata, self, kv_cache, _ = get_attention_context(layer_name)
self.impl.forward(
self,
query,
@@ -707,6 +735,7 @@ def unified_attention_with_output(
kv_cache,
attn_metadata,
output=output,
kv_cache_scale=[kv_cache_scale, v_cache_scale] if envs.VLLM_ATTN_OPT_LEVEL == 1 else kv_cache_scale,
output_scale=output_scale,
output_block_scale=output_block_scale,
)
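The op takes the two scales as separate optional tensor arguments, presumably to keep the custom-op schema to flat `Optional[Tensor]` parameters, and re-packs them into the list layout that `impl.forward` expects only at opt level 1. The re-packing, isolated (a sketch of the expression above, not new behaviour):

```python
# Equivalent standalone form of the kv_cache_scale argument built above.
def repack_scales(kv_cache_scale, v_cache_scale, opt_level: int):
    if opt_level == 1:                   # i8q/i8k/i8v: impl wants [k_scale, v_scale]
        return [kv_cache_scale, v_cache_scale]
    return kv_cache_scale                # level 2 (or unquantized): pass through
```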
@@ -718,6 +747,8 @@ def unified_attention_with_output_fake(
value: torch.Tensor,
output: torch.Tensor,
layer_name: str,
kv_cache_scale: torch.Tensor | None = None,
v_cache_scale: torch.Tensor | None = None,
output_scale: torch.Tensor | None = None,
output_block_scale: torch.Tensor | None = None,
kv_cache_dummy_dep: torch.Tensor | None = None,