diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py
index e9ce36e..51fbae2 100644
--- a/tests/ut/attention/test_attention_v1.py
+++ b/tests/ut/attention/test_attention_v1.py
@@ -3,12 +3,15 @@ from unittest.mock import MagicMock, patch
 import torch
 
 from tests.ut.base import TestBase
+from vllm_ascend.attention.attention_v1 import \
+    AscendAttentionBackendImpl092  # isort: skip
 from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendAttentionBackendImpl,
                                                 AscendAttentionMetadataBuilder,
                                                 AscendAttentionState,
                                                 AscendMetadata,
                                                 CommonAttentionState)
+from vllm_ascend.utils import vllm_version_is
 
 
 class TestAscendAttentionBackend(TestBase):
@@ -17,8 +20,12 @@ class TestAscendAttentionBackend(TestBase):
         self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
 
     def test_get_impl_cls(self):
-        self.assertEqual(AscendAttentionBackend.get_impl_cls(),
-                         AscendAttentionBackendImpl)
+        if vllm_version_is("0.9.2"):
+            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                             AscendAttentionBackendImpl092)
+        else:
+            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                             AscendAttentionBackendImpl)
 
     def test_get_metadata_cls(self):
         self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py
index 7d7f488..b0e9f3b 100644
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec)
+                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -43,6 +43,8 @@ class AscendAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendAttentionBackendImpl092
         return AscendAttentionBackendImpl
 
     @staticmethod
@@ -222,7 +224,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -437,6 +438,38 @@ class AscendAttentionBackendImpl(AttentionImpl):
         return output.view(num_tokens, self.hidden_size)
 
 
+class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        use_irope: bool = False,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+        )
+
+
 def unified_ascend_attention_with_output(
     query: torch.Tensor,
     key: torch.Tensor,
diff --git a/vllm_ascend/attention/attention_v1_torchair.py b/vllm_ascend/attention/attention_v1_torchair.py
index 9d9b91b..0c50290 100644
--- a/vllm_ascend/attention/attention_v1_torchair.py
+++ b/vllm_ascend/attention/attention_v1_torchair.py
@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d)
+                               nd_to_nz_2d, vllm_version_is)
 
 
 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,6 +41,8 @@ class AscendAttentionTorchairBackend(AttentionBackend):
 
     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendAttentionTorchairBackendImpl092
         return AscendAttentionTorchairBackendImpl
 
     @staticmethod
@@ -333,7 +335,6 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -501,3 +502,36 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
                 "to use ascend scheduler.")
 
         return output.view(num_tokens, self.hidden_size)
+
+
+class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
+                                            ):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        use_irope: bool = False,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+        )
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index a8fb7bc..37e9454 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1,11 +1,12 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, TypeVar
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
+                    TypeVar)
 
 import numpy as np
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata,
+                                              AttentionMetadata, AttentionType,
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import get_current_vllm_config
@@ -20,7 +21,8 @@ from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import npu_prefetch, npu_stream_switch, npu_wait_tensor
+from vllm_ascend.utils import (npu_prefetch, npu_stream_switch,
+                               npu_wait_tensor, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import InputBatch
 
 if TYPE_CHECKING:
@@ -66,6 +68,8 @@ class AscendMLABackend(AttentionBackend):
 
     @staticmethod
     def get_impl_cls() -> Type["MLAAttentionImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendMLAImpl092
         return AscendMLAImpl
 
 
@@ -533,7 +537,6 @@ class AscendMLAImpl(MLAAttentionImpl):
         alibi_slopes: Optional[list[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[dict[str, Any]],
         logits_soft_cap: Optional[float],
         attn_type: str,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -1226,3 +1229,34 @@ class AscendMLAImpl(MLAAttentionImpl):
         output[:num_decode_tokens] = output_decode
 
         return output_padded
+
+
+class AscendMLAImpl092(AscendMLAImpl):
+
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 num_kv_heads: int,
+                 alibi_slopes: Optional[List[float]],
+                 sliding_window: Optional[int],
+                 kv_cache_dtype: str,
+                 blocksparse_params: Optional[Dict[str, Any]] = None,
+                 logits_soft_cap: Optional[float] = None,
+                 attn_type: str = AttentionType.DECODER,
+                 kv_sharing_target_layer_name: Optional[str] = None,
+                 use_irope: bool = False,
+                 **kwargs) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+            **kwargs)