[Refactor] Remove redundant attention operator branches. (#4531)

Reason:

We replace the other attention ops with fused_infer_attention_score, except
for the decode_only state. This cleans up the code and removes 310P support.

https://github.com/vllm-project/vllm-ascend/pull/4455
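
For background, fused_infer_attention_score refers to the torch_npu kernel
torch_npu.npu_fused_infer_attention_score, which can serve all prefill-like
states with a single call. The sketch below only illustrates such a call:
the wrapper fused_prefill_attention and the keyword names shown (atten_mask,
actual_seq_lengths, num_heads, input_layout, scale) follow the public
torch_npu documentation and are assumptions here, not the literal code added
by this PR.

```python
import torch_npu


def fused_prefill_attention(query, key, value, attn_mask, seq_lens,
                            num_heads, scale):
    # One fused kernel covers PrefillNoCache, PrefillCacheHit and
    # ChunkedPrefill; only the decode-only path keeps its dedicated op.
    # Keyword names follow the torch_npu docs and may differ between
    # torch_npu releases (an assumption, not this PR's exact call).
    out, _lse = torch_npu.npu_fused_infer_attention_score(
        query, key, value,
        atten_mask=attn_mask,          # split-fuse mask from the mask builder
        actual_seq_lengths=seq_lens,   # per-request sequence lengths
        num_heads=num_heads,
        input_layout="BSH",            # batch, sequence, hidden
        scale=scale)
    return out
```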


- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
Author: weijinqian0
Date: 2025-12-02 09:13:26 +08:00
Committed by: GitHub
Parent: 981a14f8d5
Commit: b4bf01ead1

3 changed files with 119 additions and 470 deletions


@@ -979,25 +979,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         # dcp situation.
         if self.dcp_size > 1:
             return self.attn_mask_builder.get_splitfuse_attn_mask()
-        if self.vllm_config.model_config.use_mla:
-            return None
         # Pooling situation.
         if self.model_config.runner_type == "pooling" and self.model_config.pooler_config.pooling_type == "CLS":
             return self.attn_mask_builder.get_pooling_mask(self.device)
-        # Chunk Prefill situation.
-        elif attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla and not self.use_sparse:
+        # fia prefill situation.
+        if attn_state in [
+                AscendAttentionState.PrefillNoCache,
+                AscendAttentionState.PrefillCacheHit,
+                AscendAttentionState.ChunkedPrefill
+        ]:
             return self.attn_mask_builder.get_splitfuse_attn_mask()
-        # Prefill without cache situation.
-        elif attn_state == AscendAttentionState.PrefillNoCache:
-            max_seq_len = max(seq_lens.max().item(), 0)
-            return self.attn_mask_builder.get_attn_mask(
-                max_seq_len, self.dtype, self.device)
-        # Prefill with cache hit.
-        elif attn_state == AscendAttentionState.PrefillCacheHit:
-            return self.attn_mask_builder.get_splitfuse_attn_mask().to(
-                torch.bool)
-        # Decode-only situation.
-        else:
-            return None
+        return None

     def _make_fia_attention_mask(self) -> torch.Tensor:
         # pcp situation.
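
Stripped of the surrounding runner plumbing, the surviving logic reduces to
one prefill branch plus a decode fallthrough. The snippet below is a
simplified, runnable illustration of that structure: the trimmed
AscendAttentionState enum, StubMaskBuilder, and make_attention_mask are
stand-ins for the real vllm-ascend classes, not the actual code.

```python
from enum import Enum, auto


class AscendAttentionState(Enum):
    # Trimmed to the states relevant to this hunk.
    PrefillNoCache = auto()
    PrefillCacheHit = auto()
    ChunkedPrefill = auto()
    DecodeOnly = auto()


class StubMaskBuilder:
    def get_splitfuse_attn_mask(self):
        # The real builder returns a torch.Tensor; a string keeps this runnable.
        return "splitfuse_mask"


def make_attention_mask(attn_state, builder=StubMaskBuilder()):
    # All prefill-like states now share the split-fuse mask consumed by
    # fused_infer_attention_score; the decode-only path needs no mask.
    if attn_state in (AscendAttentionState.PrefillNoCache,
                      AscendAttentionState.PrefillCacheHit,
                      AscendAttentionState.ChunkedPrefill):
        return builder.get_splitfuse_attn_mask()
    return None


if __name__ == "__main__":
    assert make_attention_mask(AscendAttentionState.ChunkedPrefill) == "splitfuse_mask"
    assert make_attention_mask(AscendAttentionState.DecodeOnly) is None
```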