Upgrade CANN to 8.3.rc1 (#3945)

### What this PR does / why we need it? This PR upgrades CANN from 8.2.rc1 to 8.3.rc1 and removes the CANN version check logic. TODO: we noticed that UT runs fail with the CANN 8.3 image, so the base image for UT is still 8.2. We'll fix it later. - vLLM version: v0.11.0 - vLLM main: 83f478bb19 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-11-03 20:21:07 +08:00
parent 49d74785c4
commit cc2cd42ad3
39 changed files with 119 additions and 213 deletions
--- a/vllm_ascend/attention/attention_mask.py
+++ b/vllm_ascend/attention/attention_mask.py
@@ -47,11 +47,10 @@ class AttentionMaskBuilder:
        self.attn_mask_cache = attn_mask
        self.device = device
        self.pooling_mask = None
-        if torch.version.cann.startswith("8.3"):
-            assigned_mask_dim = 2048
-            self.chunked_prefill_attn_mask = torch.triu(
-                torch.ones(assigned_mask_dim, assigned_mask_dim),
-                diagonal=1).to(torch.int8).to(device)
+        assigned_mask_dim = 2048
+        self.chunked_prefill_attn_mask = torch.triu(
+            torch.ones(assigned_mask_dim, assigned_mask_dim),
+            diagonal=1).to(torch.int8).to(device)

    @staticmethod
    def get_mask_scale_factor(dtype: torch.dtype = torch.float16):
@@ -68,7 +67,7 @@ class AttentionMaskBuilder:

    def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype,
                      device: torch.device):
-        if max_seq_len == 2048 and torch.version.cann.startswith("8.3"):
+        if max_seq_len == 2048:
            return self.chunked_prefill_attn_mask.to(torch.bool)
        self._update_attn_cache(max_seq_len, dtype)
        return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous(
@@ -89,23 +88,7 @@ class AttentionMaskBuilder:
        dtype: torch.dtype = None,
        device: torch.device = None,
    ) -> torch.Tensor:
-        if torch.version.cann.startswith("8.3"):
-            return self.chunked_prefill_attn_mask
-        else:
-            if dtype not in [torch.float16, torch.bfloat16]:
-                raise ValueError(
-                    "splitfuse_attn_mask now only supports bf16 and fp16")
-            max_seq_len = max(seq_lens, default=0)
-            self._update_attn_cache(max_seq_len, dtype)
-            # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
-            # is not the same. Fix this in the future when kernel is ready.
-            mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(
-                dtype)
-            attn_mask = torch.index_select(self.attn_mask_cache,
-                                           dim=0,
-                                           index=position)[:, :max_seq_len]
-            attn_mask *= mask_scale_factor
-            return attn_mask.contiguous().to(device, non_blocking=True)
+        return self.chunked_prefill_attn_mask

    def _update_attn_cache(self, seqlen: int, dtype: torch.dtype):
        if seqlen > self._seq_len_cached: