[CI] Make AttentionBackend interface compatible to fix broken CI (#1893)

vLLM commit
752c6ade2e
removed `blocksparse_params` from the attention backend interface. This PR
applies the same change to the Ascend attention backends so CI passes again,
and adds `*Impl092` shim classes (gated by `vllm_version_is("0.9.2")`) that
keep the old constructor signature when running against vLLM v0.9.2.


- vLLM version: v0.9.2
- vLLM main: 9499e26e2a
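
The change follows a simple compatibility pattern: the main-line implementation
drops `blocksparse_params` from its constructor, and a thin `*Impl092` subclass
keeps accepting (and ignoring) the keyword so vLLM v0.9.2 can still construct
the backend. Below is a minimal, self-contained sketch of that pattern; the
class and function names are hypothetical stand-ins, the real ones in the diff
are `AscendAttentionBackendImpl092`, `AscendAttentionTorchairBackendImpl092`,
and `AscendMLAImpl092`.

```python
from typing import Any, Dict, Optional, Type


class ImplLatest:
    """New-style constructor: `blocksparse_params` has been removed."""

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 logits_soft_cap: Optional[float] = None) -> None:
        self.num_heads = num_heads
        self.head_size = head_size
        self.logits_soft_cap = logits_soft_cap


class Impl092(ImplLatest):
    """Old-style constructor: accepts and ignores `blocksparse_params`."""

    def __init__(self,
                 num_heads: int,
                 head_size: int,
                 blocksparse_params: Optional[Dict[str, Any]] = None,
                 logits_soft_cap: Optional[float] = None) -> None:
        # Drop the removed keyword before delegating to the new signature.
        super().__init__(num_heads, head_size,
                         logits_soft_cap=logits_soft_cap)


def get_impl_cls(running_on_v092: bool) -> Type[ImplLatest]:
    """Version-gated dispatch, mirroring `get_impl_cls()` in the backends."""
    return Impl092 if running_on_v092 else ImplLatest


# Example: vLLM v0.9.2 still passes blocksparse_params; newer vLLM does not.
impl = get_impl_cls(running_on_v092=True)(8, 64, blocksparse_params=None)
```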

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
Authored by wangxiyuan on 2025-07-21 08:21:06 +08:00, committed by GitHub
Parent: 54f2b31184
Commit: a8b316ac5b
4 changed files with 118 additions and 10 deletions


@@ -3,12 +3,15 @@ from unittest.mock import MagicMock, patch
 import torch

 from tests.ut.base import TestBase
+from vllm_ascend.attention.attention_v1 import \
+    AscendAttentionBackendImpl092  # isort: skip
 from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendAttentionBackendImpl,
                                                 AscendAttentionMetadataBuilder,
                                                 AscendAttentionState,
                                                 AscendMetadata,
                                                 CommonAttentionState)
+from vllm_ascend.utils import vllm_version_is


 class TestAscendAttentionBackend(TestBase):
@@ -17,8 +20,12 @@ class TestAscendAttentionBackend(TestBase):
         self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")

     def test_get_impl_cls(self):
-        self.assertEqual(AscendAttentionBackend.get_impl_cls(),
-                         AscendAttentionBackendImpl)
+        if vllm_version_is("0.9.2"):
+            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                             AscendAttentionBackendImpl092)
+        else:
+            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                             AscendAttentionBackendImpl)

     def test_get_metadata_cls(self):
         self.assertEqual(AscendAttentionBackend.get_metadata_cls(),


@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec)
+                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)


 class AscendAttentionBackend(AttentionBackend):
@@ -43,6 +43,8 @@ class AscendAttentionBackend(AttentionBackend):

     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendAttentionBackendImpl092
         return AscendAttentionBackendImpl

     @staticmethod
@@ -222,7 +224,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -437,6 +438,38 @@ class AscendAttentionBackendImpl(AttentionImpl):
         return output.view(num_tokens, self.hidden_size)


+class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        use_irope: bool = False,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+        )
+
+
 def unified_ascend_attention_with_output(
     query: torch.Tensor,
     key: torch.Tensor,


@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d)
+                               nd_to_nz_2d, vllm_version_is)


 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,6 +41,8 @@ class AscendAttentionTorchairBackend(AttentionBackend):

     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendAttentionTorchairBackendImpl092
         return AscendAttentionTorchairBackendImpl

     @staticmethod
@@ -333,7 +335,6 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -501,3 +502,36 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
                 "to use ascend scheduler.")

         return output.view(num_tokens, self.hidden_size)
+
+
+class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
+                                            ):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        logits_soft_cap: Optional[float] = None,
+        attn_type: str = AttentionType.DECODER,
+        kv_sharing_target_layer_name: Optional[str] = None,
+        use_irope: bool = False,
+    ) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+        )


@@ -1,11 +1,12 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, TypeVar
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
+                    TypeVar)

 import numpy as np
 import torch
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
-                                              AttentionMetadata,
+                                              AttentionMetadata, AttentionType,
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import get_current_vllm_config
@@ -20,7 +21,8 @@ from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.utils import npu_prefetch, npu_stream_switch, npu_wait_tensor
+from vllm_ascend.utils import (npu_prefetch, npu_stream_switch,
+                               npu_wait_tensor, vllm_version_is)
 from vllm_ascend.worker.npu_input_batch import InputBatch

 if TYPE_CHECKING:
@@ -66,6 +68,8 @@ class AscendMLABackend(AttentionBackend):

     @staticmethod
     def get_impl_cls() -> Type["MLAAttentionImpl"]:
+        if vllm_version_is("0.9.2"):
+            return AscendMLAImpl092
         return AscendMLAImpl

@@ -533,7 +537,6 @@ class AscendMLAImpl(MLAAttentionImpl):
         alibi_slopes: Optional[list[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
-        blocksparse_params: Optional[dict[str, Any]],
         logits_soft_cap: Optional[float],
         attn_type: str,
         kv_sharing_target_layer_name: Optional[str] = None,
@@ -1226,3 +1229,34 @@ class AscendMLAImpl(MLAAttentionImpl):
             output[:num_decode_tokens] = output_decode

         return output_padded
+
+
+class AscendMLAImpl092(AscendMLAImpl):
+
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 num_kv_heads: int,
+                 alibi_slopes: Optional[List[float]],
+                 sliding_window: Optional[int],
+                 kv_cache_dtype: str,
+                 blocksparse_params: Optional[Dict[str, Any]] = None,
+                 logits_soft_cap: Optional[float] = None,
+                 attn_type: str = AttentionType.DECODER,
+                 kv_sharing_target_layer_name: Optional[str] = None,
+                 use_irope: bool = False,
+                 **kwargs) -> None:
+        super().__init__(
+            num_heads=num_heads,
+            head_size=head_size,
+            scale=scale,
+            num_kv_heads=num_kv_heads,
+            alibi_slopes=alibi_slopes,
+            sliding_window=sliding_window,
+            kv_cache_dtype=kv_cache_dtype,
+            logits_soft_cap=logits_soft_cap,
+            attn_type=attn_type,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
+            use_irope=use_irope,
+            **kwargs)