[Model] Support pooling models (#3122)
### What this PR does / why we need it? Support pooling models (like `bge-reranker-v2-m3`) in vllm-ascend. This PR covers the three embedding model pooling types (cls_token, mean_token, last_token). After this [commit](17373dcd93), vLLM provides support for adapting pooling models on the v1 engine; this PR includes the corresponding adaptations on the vllm-ascend side. Fixes #1960 - vLLM version: v0.12.0 - vLLM main:ad32e3e19c--------- Signed-off-by: lianyibo <lianyibo1@kunlunit.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
@@ -35,7 +35,6 @@ class AttentionMaskBuilder:
|
||||
self.attn_mask_cache = None
|
||||
self._seq_len_cached = 0
|
||||
self.device = device
|
||||
self.pooling_mask = None
|
||||
self.mla_mask = None
|
||||
self.chunked_prefill_attn_mask = None
|
||||
self.pcp_mla_mask = None
|
||||
@@ -50,14 +49,6 @@ class AttentionMaskBuilder:
|
||||
return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous(
|
||||
).to(self.device, non_blocking=True)
|
||||
|
||||
def get_pooling_mask(self):
    """Return the cached 2048x2048 boolean upper-triangular attention mask.

    This is the compressed attention mask used with npu_fusion_attention
    sparse mode 4: ``True`` strictly above the main diagonal (masked-out
    positions), ``False`` on and below it. The mask is built lazily on
    first call, moved to ``self.device``, and the same tensor is returned
    on every subsequent call.
    """
    if self.pooling_mask is not None:
        return self.pooling_mask
    # diagonal=1 keeps only the strict upper triangle set to 1.
    square = torch.ones(2048, 2048)
    bool_mask = torch.triu(square, diagonal=1).to(torch.bool)
    # Cache the device-resident copy; non_blocking allows an async
    # host-to-device transfer.
    self.pooling_mask = bool_mask.to(self.device, non_blocking=True)
    return self.pooling_mask
|
||||
|
||||
def get_splitfuse_attn_mask(self) -> torch.Tensor:
|
||||
if self.chunked_prefill_attn_mask is None:
|
||||
self.chunked_prefill_attn_mask = torch.triu(
|
||||
|
||||
Reference in New Issue
Block a user