[Bugfix]: replace npu_incre_flash_attention with npu_fused_infer_attention_score (#2901)
### What this PR does / why we need it?
[Bugfix]: replace npu_incre_flash_attention with
npu_fused_infer_attention_score so that the tiling can be updated.
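For context, here is a minimal sketch (not part of this PR) of how the old `npu_incre_flash_attention` arguments map onto the keyword arguments passed to `npu_fused_infer_attention_score` in the diff below. The helper name `build_fused_attention_kwargs` is hypothetical; only the parameters visible in the diff are assumed. It merely assembles the kwargs dict on the host, so it runs without an Ascend NPU.

```python
# Hypothetical helper (illustration only): maps the old
# torch_npu.npu_incre_flash_attention arguments onto the keyword arguments
# used by torch_npu.npu_fused_infer_attention_score in the diff below.
def build_fused_attention_kwargs(query, key_cache, value_cache, *,
                                 num_heads, num_kv_heads, scale,
                                 seq_lens, block_table, block_size,
                                 attn_mask=None):
    return dict(
        query=query,                      # was the 1st positional argument
        key=key_cache,                    # was the 2nd positional argument
        value=value_cache,                # was the 3rd positional argument
        query_rope=None,                  # new parameter, unused here
        key_rope=None,                    # new parameter, unused here
        num_heads=num_heads,
        num_key_value_heads=num_kv_heads,
        input_layout='BSH',
        atten_mask=attn_mask,             # new: explicit attention mask
        sparse_mode=0,
        scale=scale,                      # was `scale_value`
        antiquant_mode=0,                 # new: KV-cache anti-quantization off
        antiquant_scale=None,
        block_table=block_table,
        block_size=block_size,
        actual_seq_lengths_kv=seq_lens,   # was `actual_seq_lengths`
    )
```

On an Ascend device the call would then be `output, _ = torch_npu.npu_fused_infer_attention_score(**kwargs)`, matching the replacement shown in the diff.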
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- vLLM version: v0.10.2
- vLLM main: 2b85697031
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
@@ -439,17 +439,24 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
             block_size = key_cache.shape[1]
             query = query.view(num_tokens, 1,
                                self.num_heads * self.head_size).contiguous()
-            output = torch_npu.npu_incre_flash_attention(
-                query,
-                key_cache,
-                value_cache,
-                num_key_value_heads=self.num_kv_heads,
-                num_heads=self.num_heads,
-                actual_seq_lengths=seq_lens,
-                scale_value=self.scale,
-                block_table=block_table,
-                input_layout='BSH',
-                block_size=block_size)
+            output, _ = torch_npu.npu_fused_infer_attention_score(
+                query=query,
+                key=key_cache,
+                value=value_cache,
+                query_rope=None,
+                key_rope=None,
+                num_heads=self.num_heads,
+                num_key_value_heads=self.num_kv_heads,
+                input_layout='BSH',
+                atten_mask=decode_meta.attn_mask,
+                sparse_mode=0,
+                scale=self.scale,
+                antiquant_mode=0,
+                antiquant_scale=None,
+                block_table=block_table,
+                block_size=block_size,
+                actual_seq_lengths_kv=seq_lens,
+            )
         else:
             raise NotImplementedError(
                 "Torchair graph mode with non-MLA attention backend is still experimental."