Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -12,7 +12,7 @@ from vllm.config.vllm import VllmConfig
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.attention.kv_transfer_utils import (
    maybe_transfer_kv_layer,
    maybe_transfer_kv_layer
)
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
@@ -40,6 +40,9 @@ from vllm.v1.kv_cache_interface import (
    KVCacheSpec,
    SlidingWindowSpec,
)
from .extra_cache import StaticQuantManager
from ixformer.core import config
_USE_TORCH_OPS = config.IXFORMER_USE_TORCH_OPS

if TYPE_CHECKING:
    from vllm.model_executor.layers.attention import MLAAttention
@@ -202,6 +205,7 @@ class Attention(nn.Module, AttentionLayerBase):
        kv_sharing_target_layer_name: str | None = None,
        attn_backend: type[AttentionBackend] | None = None,
        head_size_v: int | None = None,
        extra_cache_para: dict = None,
        **extra_impl_args,
    ) -> None:
        """
@@ -258,6 +262,7 @@ class Attention(nn.Module, AttentionLayerBase):

        self.num_heads = num_heads
        self.head_size = head_size
        self.hidden_size = head_size * num_heads
        self.head_size_v = self.head_size if head_size_v is None else head_size_v
        self.num_kv_heads = num_kv_heads
        self.sliding_window = sliding_window
@@ -326,6 +331,15 @@ class Attention(nn.Module, AttentionLayerBase):
            kv_sharing_target_layer_name,
            **extra_impl_args,
        )
        if extra_cache_para is not None:
            self.quant_manager = StaticQuantManager(
                layer_id=extra_cache_para.get("layer_id", None),
                shape=(self.num_kv_heads, self.head_size_v),
                dtype=torch.float32,
                total_layer_num=extra_cache_para.get("total_layer_num", None)
            )
        else:
            self.quant_manager = None
        self.backend = AttentionBackendEnum[self.attn_backend.get_name()]
        self.dtype = dtype

@@ -333,7 +347,10 @@ class Attention(nn.Module, AttentionLayerBase):
        # torch.compile works by registering the attention as one giant
        # opaque custom op. For other platforms, we directly call them
        # and let torch.compile handle them.
        self.use_direct_call = not current_platform.opaque_attention_op()
        if _USE_TORCH_OPS:
            self.use_direct_call = False
        else:
            self.use_direct_call = True

        self.use_output = self.attn_backend.accept_output_buffer
        compilation_config = vllm_config.compilation_config
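Note: a minimal sketch of what the use_direct_call toggle selects between, assuming only the two code paths visible later in this diff (the direct impl.forward call and the torch.ops.vllm custom op); run_attention and its argument handling are illustrative, not code from the file.

import torch

def run_attention(layer, query, key, value, kv_cache, attn_metadata, output):
    # Illustrative dispatch only. The direct path calls the backend impl as
    # ordinary Python, so torch.compile traces through it; the op path goes
    # through the registered opaque custom op, which compiles as a single node.
    if layer.use_direct_call:
        return layer.impl.forward(
            layer, query, key, value, kv_cache, attn_metadata, output=output
        )
    torch.ops.vllm.unified_attention_with_output(
        query, key, value, output, layer.layer_name
    )
    return output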
@@ -349,14 +366,26 @@ class Attention(nn.Module, AttentionLayerBase):
                compilation_config.static_forward_context,
            )
        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name

        # use a placeholder kv cache tensor during init, which will be replaced
        # by bind_kv_cache
        # this variable will not be accessed if use_direct_call is True
        self.kv_cache = [
            torch.tensor([])
            for _ in range(vllm_config.parallel_config.pipeline_parallel_size)
        ]
        self.is_i8qi8ki8v = envs.VLLM_ATTN_OPT_LEVEL == 1
        self.is_i8qi8kf16v = envs.VLLM_ATTN_OPT_LEVEL == 2
        if self.is_i8qi8kf16v:
            self.kv_cache_scale = [
                torch.tensor([]) for _ in range(get_current_vllm_config(
                ).parallel_config.pipeline_parallel_size)
            ]
        elif self.is_i8qi8ki8v:
            self.kv_cache_scale = [
                [torch.tensor([]), torch.tensor([])] for _ in range(get_current_vllm_config(
                ).parallel_config.pipeline_parallel_size)
            ]

        # use a placeholder kv cache tensor during init, which will be replaced
        # by bind_kv_cache
        # this variable will not be accessed if use_direct_call is True

        # Initialize KV cache quantization attributes
        _init_kv_cache_quant(self, quant_config, prefix)
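Note: a small, self-contained sketch of the placeholder pattern above, assuming (as the comments state) that the empty tensors are later swapped for real per-virtual-engine cache tensors by bind_kv_cache; bind_caches and the tensor sizes used here are illustrative stand-ins, not the vLLM API.

import torch

pipeline_parallel_size = 2  # illustrative value

# One empty placeholder per pipeline-parallel virtual engine, as in the diff.
kv_cache = [torch.tensor([]) for _ in range(pipeline_parallel_size)]
# Opt level 1 additionally keeps a [k_scale, v_scale] pair per virtual engine.
kv_cache_scale = [[torch.tensor([]), torch.tensor([])] for _ in range(pipeline_parallel_size)]

def bind_caches(placeholders, real_tensors):
    # Hypothetical stand-in for the later binding step: overwrite each list
    # slot so anything holding the list now sees the real cache tensor.
    for i, real in enumerate(real_tensors):
        placeholders[i] = real

bind_caches(kv_cache, [torch.zeros(16, 8), torch.zeros(16, 8)])
virtual_engine = 0
print(kv_cache[virtual_engine].shape)  # torch.Size([16, 8])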
@@ -396,6 +425,7 @@ class Attention(nn.Module, AttentionLayerBase):
        context using
        `vllm.forward_context.get_forward_context().attn_metadata`.
        """
        optional_args = {}
        if self.calculate_kv_scales:
            torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name)
        output_dtype = query.dtype
@@ -412,15 +442,8 @@ class Attention(nn.Module, AttentionLayerBase):
            query, _ = self.query_quant(query, self._q_scale)

        if self.use_output:
            if output_shape is None:
                # Handle both 2D [num_tokens, hidden] and
                # 3D [num_tokens, heads, head_dim] query
                num_tokens = query.shape[0]
                output_shape = torch.Size(
                    (num_tokens, self.num_heads * self.head_size_v)
                )
            output_shape = output_shape if output_shape is not None else query.shape
            output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
            hidden_size = output_shape[-1]
            # Reshape the query, key, and value tensors.
            # NOTE(woosuk): We do this outside the custom op to minimize the
            # CPU overheads from the non-CUDA-graph regions.
@@ -430,46 +453,50 @@ class Attention(nn.Module, AttentionLayerBase):
                key = key.view(-1, self.num_kv_heads, self.head_size)
            if value is not None:
                value = value.view(-1, self.num_kv_heads, self.head_size_v)
            kv_cache_dummy_dep = None
            if self.use_direct_call:
                # Skip this if sharing KV cache with an earlier attention layer.
                if (
                    not self.attn_backend.forward_includes_kv_cache_update
                    and self.kv_sharing_target_layer_name is None
                    and key is not None
                    and value is not None
                ):
                    kv_cache_dummy_dep = unified_kv_cache_update(
                        key, value, self.layer_name
                def direct_forward(layer_name: str, output: torch.Tensor):
                    forward_context: ForwardContext = get_forward_context()
                    attn_metadata = forward_context.attn_metadata
                    if isinstance(attn_metadata, dict):
                        attn_metadata = attn_metadata[layer_name]
                    self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                    # Skip this if sharing KV cache with an earlier attention layer.
                    if self.is_i8qi8ki8v or self.is_i8qi8kf16v:
                        optional_args["kv_cache_scale"] = self.kv_cache_scale[forward_context.virtual_engine]
                    output = self.impl.forward(
                        self,
                        query,
                        key,
                        value,
                        self_kv_cache,
                        attn_metadata,
                        output=output,
                        **optional_args
                    )
                unified_attention_with_output(
                    query,
                    key,
                    value,
                    output,
                    self.layer_name,
                    kv_cache_dummy_dep=kv_cache_dummy_dep,
                )
                    return output
                return maybe_transfer_kv_layer(direct_forward)(self.layer_name, output)
            else:
                # Skip this if sharing KV cache with an earlier attention layer.
                if (
                    not self.attn_backend.forward_includes_kv_cache_update
                    and self.kv_sharing_target_layer_name is None
                    and key is not None
                    and value is not None
                ):
                    kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
                        key, value, self.layer_name
                    )
                if self.is_i8qi8ki8v:
                    forward_context: ForwardContext = get_forward_context()
                    kv_cache_scale = self.kv_cache_scale[forward_context.virtual_engine][0]
                    v_cache_scale = self.kv_cache_scale[forward_context.virtual_engine][1]
                elif self.is_i8qi8kf16v:
                    forward_context: ForwardContext = get_forward_context()
                    kv_cache_scale = self.kv_cache_scale[forward_context.virtual_engine]
                    v_cache_scale = None
                else:
                    kv_cache_scale = None
                    v_cache_scale = None
                torch.ops.vllm.unified_attention_with_output(
                    query,
                    key,
                    value,
                    output,
                    self.layer_name,
                    kv_cache_dummy_dep=kv_cache_dummy_dep,
                    kv_cache_scale,
                    v_cache_scale
                )
            return output.view(-1, hidden_size)
            return output.view(-1, self.hidden_size)
        else:
            assert self.attn_backend.forward_includes_kv_cache_update, (
                "Split KV cache update not supported when output tensor not provided."
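Note: a compact restatement of the scale-selection branch above as a standalone helper, under the assumption that opt level 1 (is_i8qi8ki8v) keeps separate K and V cache scales while level 2 (is_i8qi8kf16v) quantizes only the K cache and therefore carries a single scale; select_cache_scales is an illustrative name, not a function in the file.

import torch

def select_cache_scales(scale_slot, is_i8qi8ki8v, is_i8qi8kf16v):
    # Level 1: the slot holds a [k_scale, v_scale] pair for this virtual engine.
    if is_i8qi8ki8v:
        return scale_slot[0], scale_slot[1]
    # Level 2: the slot is a single tensor; the V cache stays fp16, so no V scale.
    if is_i8qi8kf16v:
        return scale_slot, None
    # Other levels: no KV cache scales at all.
    return None, None

k_scale, v_scale = select_cache_scales([torch.tensor([0.1]), torch.tensor([0.2])], True, False)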
@@ -521,6 +548,7 @@ class Attention(nn.Module, AttentionLayerBase):
        block_size = vllm_config.cache_config.block_size
        # Should not be called for enc-dec or encoder-only attention.
        assert self.attn_type == AttentionType.DECODER
        # TODO: the kernel does not support a KV cache for sliding_window; use FullAttentionSpec instead.
        if self.sliding_window is not None:
            assert not vllm_config.model_config.use_mla, (
                "MLA is not supported with sliding window"
@@ -689,6 +717,8 @@ def unified_attention_with_output(
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    kv_cache_scale: torch.Tensor | None = None,
    v_cache_scale: torch.Tensor | None = None,
    output_scale: torch.Tensor | None = None,
    output_block_scale: torch.Tensor | None = None,
    kv_cache_dummy_dep: torch.Tensor | None = None,
@@ -696,9 +726,7 @@ def unified_attention_with_output(
    # kv_cache_dummy_dep is not used but accepting it creates a data dependency
    # that ensures torch.compile preserves ordering between KV cache update and
    # attention forward.
    del kv_cache_dummy_dep
    attn_metadata, self, kv_cache, _ = get_attention_context(layer_name)

    self.impl.forward(
        self,
        query,
@@ -707,6 +735,7 @@ def unified_attention_with_output(
        kv_cache,
        attn_metadata,
        output=output,
        kv_cache_scale=[kv_cache_scale, v_cache_scale] if envs.VLLM_ATTN_OPT_LEVEL == 1 else kv_cache_scale,
        output_scale=output_scale,
        output_block_scale=output_block_scale,
    )
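Note: the kv_cache_dummy_dep comment above describes an ordering trick; the sketch below is a generic, framework-agnostic illustration of it, not vLLM's registration code. The cache-update step returns a small tensor that the attention step accepts (and discards), so a compiler that follows data dependencies cannot move the update after the attention.

import torch

def cache_update(key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
    # ... write key/value into the cache here (omitted) ...
    # Return a tiny tensor whose only job is to act as a dependency token.
    return torch.empty(0, device=key.device)

def attention(query: torch.Tensor, dep: torch.Tensor | None = None) -> torch.Tensor:
    # The token is unused; accepting it creates the graph edge that pins the
    # cache update before this op.
    del dep
    return query  # placeholder for the real attention math

q = torch.randn(2, 4)
token = cache_update(torch.randn(2, 4), torch.randn(2, 4))
out = attention(q, dep=token)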
@@ -718,6 +747,8 @@ def unified_attention_with_output_fake(
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
    kv_cache_scale: torch.Tensor | None = None,
    v_cache_scale: torch.Tensor | None = None,
    output_scale: torch.Tensor | None = None,
    output_block_scale: torch.Tensor | None = None,
    kv_cache_dummy_dep: torch.Tensor | None = None,
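Note: the fake variant's parameter list is extended here to stay in sync with the real op. As a hedged illustration of why the two must match, the generic torch.library pattern below registers a real custom op and a fake (meta) implementation with the same signature, which is what lets torch.compile trace shapes without running the kernel; the mylib::attn_stub op is purely illustrative and not vLLM's own registration helper.

import torch
from torch.library import custom_op, register_fake

@custom_op("mylib::attn_stub", mutates_args=("output",))
def attn_stub(query: torch.Tensor, output: torch.Tensor, scale: float) -> None:
    # Real implementation: write the scaled result into the output buffer.
    output.copy_(query * scale)

@register_fake("mylib::attn_stub")
def attn_stub_fake(query: torch.Tensor, output: torch.Tensor, scale: float) -> None:
    # Fake/meta implementation: identical signature, no computation; it only
    # describes the op's effect on shapes and dtypes for the compiler.
    return None

q = torch.randn(2, 3)
out = torch.empty_like(q)
torch.ops.mylib.attn_stub(q, out, 2.0)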