[Perf] Add FIA interface in FA case (#3321)

### What this PR does / why we need it? Add new npu_fused_infer_attention_score op to improve perfomance in flash attention case. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: ZYang6263 <zy626375@gmail.com>
2025-10-19 12:45:33 +08:00
parent 4b3bd4f397
commit 1e78ecbad6
2 changed files with 50 additions and 12 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -898,9 +898,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):

        # Prefill without cache situation.
        elif attn_state == AscendAttentionState.PrefillNoCache:
-            max_seq_len = max(seq_lens.max().item(), 0)
-            return self.attn_mask_builder.get_attn_mask(
-                max_seq_len, self.dtype, self.device)
+            if torch.version.cann.startswith("8.3"):
+                return self.attn_mask_builder.get_splitfuse_attn_mask()
+            else:
+                max_seq_len = max(seq_lens, default=0)
+                return self.attn_mask_builder.get_attn_mask(
+                    max_seq_len, self.dtype, self.device)
        # Prefill with cache hit.
        elif attn_state == AscendAttentionState.PrefillCacheHit:
            return self.attn_mask_builder.get_attn_mask(