[Bugfix]:replace npu_incre_flash_attention with npu_fused_infer_atten… (#2901)

### What this PR does / why we need it?
[Bugfix]: replace npu_incre_flash_attention with
npu_fused_infer_attention_score so that the attention tiling can be updated.
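
As a rough illustration, below is a minimal, standalone sketch of a decode-step call to the new fused operator, mirroring the keyword arguments used in this patch. The tensor shapes, dtypes, and the single-block block table are assumptions made up for the example only; it needs `torch_npu` with an Ascend NPU device to actually run.

```python
import torch
import torch_npu

# Illustrative sizes (assumptions, not taken from the patch).
num_tokens, num_heads, num_kv_heads, head_size = 4, 8, 8, 128
block_size, num_blocks = 128, 16

# Decode query in 'BSH' layout: (batch, seq_len=1, num_heads * head_size).
query = torch.randn(num_tokens, 1, num_heads * head_size,
                    dtype=torch.float16, device="npu")

# Paged KV cache: (num_blocks, block_size, num_kv_heads * head_size);
# the patch reads block_size from key_cache.shape[1].
key_cache = torch.randn(num_blocks, block_size, num_kv_heads * head_size,
                        dtype=torch.float16, device="npu")
value_cache = torch.randn_like(key_cache)

# One cache block per request, plus each request's current KV length.
block_table = torch.arange(num_tokens, dtype=torch.int32,
                           device="npu").view(num_tokens, 1)
seq_lens = [1] * num_tokens

# The fused op returns a tuple; only the attention output is used,
# matching the `output, _ = ...` assignment in the diff below.
output, _ = torch_npu.npu_fused_infer_attention_score(
    query=query,
    key=key_cache,
    value=value_cache,
    num_heads=num_heads,
    num_key_value_heads=num_kv_heads,
    input_layout='BSH',
    scale=head_size ** -0.5,
    sparse_mode=0,
    block_table=block_table,
    block_size=block_size,
    actual_seq_lengths_kv=seq_lens,
)
```

Compared with the old npu_incre_flash_attention call, the fused operator returns a tuple and receives the per-request KV lengths through `actual_seq_lengths_kv` rather than `actual_seq_lengths`, as the diff below shows.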

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main: 2b85697031

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
panchao-hub committed via GitHub on 2025-09-18 14:06:08 +08:00
parent 6681dde902, commit a7f8ed38ed
2 changed files with 111 additions and 9 deletions


@@ -439,17 +439,24 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
                 block_size = key_cache.shape[1]
                 query = query.view(num_tokens, 1,
                                    self.num_heads * self.head_size).contiguous()
-                output = torch_npu.npu_incre_flash_attention(
-                    query,
-                    key_cache,
-                    value_cache,
-                    num_key_value_heads=self.num_kv_heads,
+                output, _ = torch_npu.npu_fused_infer_attention_score(
+                    query=query,
+                    key=key_cache,
+                    value=value_cache,
+                    query_rope=None,
+                    key_rope=None,
                     num_heads=self.num_heads,
-                    actual_seq_lengths=seq_lens,
-                    scale_value=self.scale,
-                    block_table=block_table,
+                    num_key_value_heads=self.num_kv_heads,
                     input_layout='BSH',
-                    block_size=block_size)
+                    atten_mask=decode_meta.attn_mask,
+                    sparse_mode=0,
+                    scale=self.scale,
+                    antiquant_mode=0,
+                    antiquant_scale=None,
+                    block_table=block_table,
+                    block_size=block_size,
+                    actual_seq_lengths_kv=seq_lens,
+                )
             else:
                 raise NotImplementedError(
                     "Torchair graph mode with non-MLA attention backend is still experimental."