Upgrade CANN to 8.3.rc1 (#3945)

### What this PR does / why we need it? This PR upgrades CANN from 8.2.rc1 to 8.3.rc1 and removes the CANN version check logic. TODO: we noticed that UT runs fail with the CANN 8.3 image, so the base image for UT is still 8.2. We'll fix it later. - vLLM version: v0.11.0 - vLLM main: 83f478bb19 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-11-03 20:21:07 +08:00
parent 49d74785c4
commit cc2cd42ad3
39 changed files with 119 additions and 213 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -367,13 +367,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                                             use_sparse=self.use_sparse)
        if self.pcp_size > 1:
            self.attn_mask_builder = None
-        elif torch.version.cann.startswith("8.3"):
+        else:
            self.attn_mask_builder = AttentionMaskBuilder(
                self.scheduler_config.max_num_batched_tokens, self.dtype,
                self.device)
-        else:
-            self.attn_mask_builder = AttentionMaskBuilder(
-                self.model_config.max_model_len, self.dtype)

        self._set_up_drafter()

@@ -988,11 +985,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                max_seq_len = max(seq_lens.max().item(), 0)
                return self.attn_mask_builder.get_attn_mask(
                    max_seq_len, self.dtype, self.device)
-            elif torch.version.cann.startswith("8.3"):
-                return self.attn_mask_builder.get_splitfuse_attn_mask()
            else:
-                return self.attn_mask_builder.get_splitfuse_attn_mask(
-                    seq_lens, position, self.dtype, self.device)
+                return self.attn_mask_builder.get_splitfuse_attn_mask()

        # Prefill without cache situation.
        elif attn_state == AscendAttentionState.PrefillNoCache:
@@ -1001,12 +995,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                max_seq_len, self.dtype, self.device)
        # Prefill with cache hit.
        elif attn_state == AscendAttentionState.PrefillCacheHit:
-            if torch.version.cann.startswith("8.3"):
-                return self.attn_mask_builder.get_attn_mask(
-                    2048, self.dtype, self.device)
-            else:
-                return self.attn_mask_builder.get_attn_mask(
-                    128, self.dtype, self.device)
+            return self.attn_mask_builder.get_attn_mask(
+                2048, self.dtype, self.device)
        # Decode-only situation.
        else:
            return None