[Refactor] Remove redundant attention operator branches. (#4531)
[Refactor] Remove redundant attention operator branches. Reason: We replace the other attention ops with fused_infer_attention_score, except in the decode_only state. Clean up the code and remove 310P support. https://github.com/vllm-project/vllm-ascend/pull/4455 - vLLM version: v0.11.2 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2 --------- Signed-off-by: weijinqian_v1 <weijinqian@huawei.com> Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
@@ -979,25 +979,21 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
# dcp situation.
|
||||
if self.dcp_size > 1:
|
||||
return self.attn_mask_builder.get_splitfuse_attn_mask()
|
||||
if self.vllm_config.model_config.use_mla:
|
||||
return None
|
||||
# Pooling situation.
|
||||
if self.model_config.runner_type == "pooling" and self.model_config.pooler_config.pooling_type == "CLS":
|
||||
return self.attn_mask_builder.get_pooling_mask(self.device)
|
||||
# Chunk Prefill situation.
|
||||
elif attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla and not self.use_sparse:
|
||||
# fia prefill situation.
|
||||
if attn_state in [
|
||||
AscendAttentionState.PrefillNoCache,
|
||||
AscendAttentionState.PrefillCacheHit,
|
||||
AscendAttentionState.ChunkedPrefill
|
||||
]:
|
||||
return self.attn_mask_builder.get_splitfuse_attn_mask()
|
||||
|
||||
# Prefill without cache situation.
|
||||
elif attn_state == AscendAttentionState.PrefillNoCache:
|
||||
max_seq_len = max(seq_lens.max().item(), 0)
|
||||
return self.attn_mask_builder.get_attn_mask(
|
||||
max_seq_len, self.dtype, self.device)
|
||||
# Prefill with cache hit.
|
||||
elif attn_state == AscendAttentionState.PrefillCacheHit:
|
||||
return self.attn_mask_builder.get_splitfuse_attn_mask().to(
|
||||
torch.bool)
|
||||
# Decode-only situation.
|
||||
else:
|
||||
return None
|
||||
return None
|
||||
|
||||
def _make_fia_attention_mask(self) -> torch.Tensor:
|
||||
# pcp situation.
|
||||
|
||||
Reference in New Issue
Block a user