[CI] Make AttentionBackend interface compatible to fix broken CI (#1893)
vLLM commit 752c6ade2e removed `blocksparse_params` from the attention backend. This PR makes the same change to make CI happy. - vLLM version: v0.9.2 - vLLM main: 9499e26e2a --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Yikun Jiang <yikunkero@gmail.com> Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
|
||||
from vllm_ascend.ops.attention import vanilla_chunked_prefill
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
|
||||
nd_to_nz_2d, nd_to_nz_spec)
|
||||
nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
|
||||
|
||||
|
||||
class AscendAttentionBackend(AttentionBackend):
|
||||
@@ -43,6 +43,8 @@ class AscendAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
|
||||
if vllm_version_is("0.9.2"):
|
||||
return AscendAttentionBackendImpl092
|
||||
return AscendAttentionBackendImpl
|
||||
|
||||
@staticmethod
|
||||
@@ -222,7 +224,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
@@ -437,6 +438,38 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
return output.view(num_tokens, self.hidden_size)
|
||||
|
||||
|
||||
class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=alibi_slopes,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
logits_soft_cap=logits_soft_cap,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
|
||||
use_irope=use_irope,
|
||||
)
|
||||
|
||||
|
||||
def unified_ascend_attention_with_output(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
|
||||
Reference in New Issue
Block a user