[Refactor] Remove redundant attention operator branches. (#4531)

Reason:

We replace the other attention ops with fused_infer_attention_score, except
for the decode_only state. This cleans up the code and removes 310P support.

https://github.com/vllm-project/vllm-ascend/pull/4455
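
For background, fused_infer_attention_score refers to the torch_npu kernel
torch_npu.npu_fused_infer_attention_score, which can serve all prefill-like
states with a single call. The sketch below only illustrates such a call:
the wrapper fused_prefill_attention and the keyword names shown (atten_mask,
actual_seq_lengths, num_heads, input_layout, scale) follow the public
torch_npu documentation and are assumptions here, not the literal code added
by this PR.

```python
import torch_npu


def fused_prefill_attention(query, key, value, attn_mask, seq_lens,
                            num_heads, scale):
    # One fused kernel covers PrefillNoCache, PrefillCacheHit and
    # ChunkedPrefill; only the decode-only path keeps its dedicated op.
    # Keyword names follow the torch_npu docs and may differ between
    # torch_npu releases (an assumption, not this PR's exact call).
    out, _lse = torch_npu.npu_fused_infer_attention_score(
        query, key, value,
        atten_mask=attn_mask,          # split-fuse mask from the mask builder
        actual_seq_lengths=seq_lens,   # per-request sequence lengths
        num_heads=num_heads,
        input_layout="BSH",            # batch, sequence, hidden
        scale=scale)
    return out
```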


- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
Author: weijinqian0
Date: 2025-12-02 09:13:26 +08:00
Committed by: GitHub
Parent: 981a14f8d5
Commit: b4bf01ead1

3 changed files with 119 additions and 470 deletions


@@ -979,25 +979,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # dcp situation.
         if self.dcp_size > 1:
             return self.attn_mask_builder.get_splitfuse_attn_mask()
-        if self.vllm_config.model_config.use_mla:
-            return None
         # Pooling situation.
         if self.model_config.runner_type == "pooling" and self.model_config.pooler_config.pooling_type == "CLS":
             return self.attn_mask_builder.get_pooling_mask(self.device)
-        # Chunk Prefill situation.
-        elif attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla and not self.use_sparse:
+        # fia prefill situation.
+        if attn_state in [
+                AscendAttentionState.PrefillNoCache,
+                AscendAttentionState.PrefillCacheHit,
+                AscendAttentionState.ChunkedPrefill
+        ]:
             return self.attn_mask_builder.get_splitfuse_attn_mask()
-        # Prefill without cache situation.
-        elif attn_state == AscendAttentionState.PrefillNoCache:
-            max_seq_len = max(seq_lens.max().item(), 0)
-            return self.attn_mask_builder.get_attn_mask(
-                max_seq_len, self.dtype, self.device)
-        # Prefill with cache hit.
-        elif attn_state == AscendAttentionState.PrefillCacheHit:
-            return self.attn_mask_builder.get_splitfuse_attn_mask().to(
-                torch.bool)
-        # Decode-only situation.
-        else:
-            return None
+        return None

     def _make_fia_attention_mask(self) -> torch.Tensor:
         # pcp situation.
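
Stripped of the surrounding runner plumbing, the surviving logic reduces to
one prefill branch plus a decode fallthrough. The snippet below is a
simplified, runnable illustration of that structure: the trimmed
AscendAttentionState enum, StubMaskBuilder, and make_attention_mask are
stand-ins for the real vllm-ascend classes, not the actual code.

```python
from enum import Enum, auto


class AscendAttentionState(Enum):
    # Trimmed to the states relevant to this hunk.
    PrefillNoCache = auto()
    PrefillCacheHit = auto()
    ChunkedPrefill = auto()
    DecodeOnly = auto()


class StubMaskBuilder:
    def get_splitfuse_attn_mask(self):
        # The real builder returns a torch.Tensor; a string keeps this runnable.
        return "splitfuse_mask"


def make_attention_mask(attn_state, builder=StubMaskBuilder()):
    # All prefill-like states now share the split-fuse mask consumed by
    # fused_infer_attention_score; the decode-only path needs no mask.
    if attn_state in (AscendAttentionState.PrefillNoCache,
                      AscendAttentionState.PrefillCacheHit,
                      AscendAttentionState.ChunkedPrefill):
        return builder.get_splitfuse_attn_mask()
    return None


if __name__ == "__main__":
    assert make_attention_mask(AscendAttentionState.ChunkedPrefill) == "splitfuse_mask"
    assert make_attention_mask(AscendAttentionState.DecodeOnly) is None
```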