Upgrade CANN to 8.3.rc1 (#3945)

### What this PR does / why we need it?
This PR upgrades CANN from 8.2.rc1 to 8.3.rc1 and removes the CANN version
check logic.

TODO: we noticed that UT runs fail with the CANN 8.3 image, so the base
image for UT is still 8.2. We'll fix this later.
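
For reference, the version gate removed throughout this change is a runtime check on the CANN toolkit version string exposed through `torch.version.cann`. A minimal sketch of the old pattern (the branch bodies here are placeholders, not the PR's code):

```python
import torch
import torch_npu  # noqa: F401  # torch.version.cann is available once torch_npu is loaded

# Before this PR, 8.3-only code paths were guarded like this:
if torch.version.cann.startswith("8.3"):
    pass  # e.g. use torch_npu.npu_fused_infer_attention_score
else:
    pass  # e.g. fall back to torch_npu._npu_paged_attention_splitfuse

# With CANN 8.3 as the baseline, the guard and the fallback branch are dropped
# and the 8.3 path runs unconditionally.
```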


- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2025-11-03 20:21:07 +08:00
Committed by: GitHub
Parent: 49d74785c4
Commit: cc2cd42ad3
39 changed files with 119 additions and 213 deletions


@@ -47,11 +47,10 @@ class AttentionMaskBuilder:
         self.attn_mask_cache = attn_mask
         self.device = device
         self.pooling_mask = None
-        if torch.version.cann.startswith("8.3"):
-            assigned_mask_dim = 2048
-            self.chunked_prefill_attn_mask = torch.triu(
-                torch.ones(assigned_mask_dim, assigned_mask_dim),
-                diagonal=1).to(torch.int8).to(device)
+        assigned_mask_dim = 2048
+        self.chunked_prefill_attn_mask = torch.triu(
+            torch.ones(assigned_mask_dim, assigned_mask_dim),
+            diagonal=1).to(torch.int8).to(device)
 
     @staticmethod
     def get_mask_scale_factor(dtype: torch.dtype = torch.float16):
@@ -68,7 +67,7 @@ class AttentionMaskBuilder:
     def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype,
                       device: torch.device):
-        if max_seq_len == 2048 and torch.version.cann.startswith("8.3"):
+        if max_seq_len == 2048:
             return self.chunked_prefill_attn_mask.to(torch.bool)
         self._update_attn_cache(max_seq_len, dtype)
         return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous(
@@ -89,23 +88,7 @@ class AttentionMaskBuilder:
         dtype: torch.dtype = None,
         device: torch.device = None,
     ) -> torch.Tensor:
-        if torch.version.cann.startswith("8.3"):
-            return self.chunked_prefill_attn_mask
-        else:
-            if dtype not in [torch.float16, torch.bfloat16]:
-                raise ValueError(
-                    "splitfuse_attn_mask now only supports bf16 and fp16")
-            max_seq_len = max(seq_lens, default=0)
-            self._update_attn_cache(max_seq_len, dtype)
-            # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
-            # is not the same. Fix this in the future when kernel is ready.
-            mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(
-                dtype)
-            attn_mask = torch.index_select(self.attn_mask_cache,
-                                           dim=0,
-                                           index=position)[:, :max_seq_len]
-            attn_mask *= mask_scale_factor
-            return attn_mask.contiguous().to(device, non_blocking=True)
+        return self.chunked_prefill_attn_mask
 
     def _update_attn_cache(self, seqlen: int, dtype: torch.dtype):
         if seqlen > self._seq_len_cached:
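
The net effect of the AttentionMaskBuilder changes above is that the 2048x2048 int8 chunked-prefill mask is now built unconditionally in `__init__` and returned directly for the split-fuse case. A standalone, CPU-only sketch of the same `torch.triu` construction (size and device chosen for illustration):

```python
import torch

# Upper-triangular int8 matrix: 1 marks positions a token must NOT attend to.
assigned_mask_dim = 2048
chunked_prefill_attn_mask = torch.triu(
    torch.ones(assigned_mask_dim, assigned_mask_dim),
    diagonal=1).to(torch.int8)

print(chunked_prefill_attn_mask[:4, :4])
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]], dtype=torch.int8)

# get_attn_mask() returns this mask as bool when max_seq_len == 2048; other
# lengths still go through the cached float mask sliced to
# [:max_seq_len, :max_seq_len], as in the second hunk above.
mask_2048 = chunked_prefill_attn_mask.to(torch.bool)
```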


@@ -500,7 +500,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
             block_table = attn_metadata.block_tables[:batch_size, :]
             num_block, block_size, _, _ = self.key_cache.shape  # type: ignore
-            if torch.version.cann.startswith("8.3") and block_size == 128:
+            if block_size == 128:
                 # TODO:The npu_fused_infer_attention_score op is planned to
                 # be utilized in a wider range in upcoming versions.
                 key = self.key_cache.view(  # type: ignore
@@ -680,43 +680,30 @@ class AscendAttentionBackendImpl(AttentionImpl):
             attn_metadata.seq_lens = \
                 attn_metadata.seq_lens.to(device=query.device)
-            if torch.version.cann.startswith("8.3"):
-                # TODO:The npu_fused_infer_attention_score op is planned to
-                # be utilized in a wider range in upcoming versions.
-                num_block, block_size, _, _ = self.key_cache.shape  # type: ignore
-                key = self.key_cache.view(  # type: ignore
-                    num_block, block_size, -1)
-                value = self.value_cache.view(  # type: ignore
-                    num_block, block_size, -1)
+            # TODO:The npu_fused_infer_attention_score op is planned to
+            # be utilized in a wider range in upcoming versions.
+            num_block, block_size, _, _ = self.key_cache.shape  # type: ignore
+            key = self.key_cache.view(  # type: ignore
+                num_block, block_size, -1)
+            value = self.value_cache.view(  # type: ignore
+                num_block, block_size, -1)
-                output, _ = torch_npu.npu_fused_infer_attention_score(
-                    query=query,
-                    key=key,
-                    value=value,
-                    atten_mask=attn_metadata.attn_mask,
-                    block_table=attn_metadata.block_tables,
-                    input_layout="TND",
-                    block_size=block_size,
-                    actual_seq_lengths=attn_metadata.actual_seq_lengths_q,
-                    actual_seq_lengths_kv=attn_metadata.seq_lens_list,
-                    num_key_value_heads=self.num_kv_heads,
-                    num_heads=self.num_heads,
-                    scale=self.scale,
-                    sparse_mode=3,
-                )
+            output, _ = torch_npu.npu_fused_infer_attention_score(
+                query=query,
+                key=key,
+                value=value,
+                atten_mask=attn_metadata.attn_mask,
+                block_table=attn_metadata.block_tables,
+                input_layout="TND",
+                block_size=block_size,
+                actual_seq_lengths=attn_metadata.actual_seq_lengths_q,
+                actual_seq_lengths_kv=attn_metadata.seq_lens_list,
+                num_key_value_heads=self.num_kv_heads,
+                num_heads=self.num_heads,
+                scale=self.scale,
+                sparse_mode=3,
+            )
-            else:
-                torch_npu._npu_paged_attention_splitfuse(
-                    query=query,
-                    key_cache=self.key_cache,
-                    value_cache=self.value_cache,
-                    mask=attn_metadata.attn_mask,
-                    block_table=attn_metadata.block_tables,
-                    seq_len=attn_metadata.query_lens,
-                    context_lens=attn_metadata.seq_lens,
-                    num_kv_heads=self.num_kv_heads,
-                    num_heads=self.num_heads,
-                    scale_value=self.scale,
-                    out=output)
         return output
 
     def _attention_with_nomask_and_mask(self, q: torch.Tensor,
@@ -1155,12 +1142,11 @@ class AscendAttentionBackendImpl(AttentionImpl):
                     query, attn_metadata, output)
             # Normal V1 situation.
             else:
-                if torch.version.cann.startswith("8.3"):
-                    # npu_fused_infer_attention_score does not support cases
-                    # where query.shape[0] != attn_metadata.query_start_loc[-1].
-                    # Thus we need unpad it here.
-                    num_tokens = attn_metadata.query_start_loc[-1]
-                    query = query[:num_tokens]
+                # npu_fused_infer_attention_score does not support cases
+                # where query.shape[0] != attn_metadata.query_start_loc[-1].
+                # Thus we need unpad it here.
+                num_tokens = attn_metadata.query_start_loc[-1]
+                query = query[:num_tokens]
                 intermediate_output = self._forward_v1_style(
                     query, attn_metadata, output)
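
Both paths above now feed `torch_npu.npu_fused_infer_attention_score`, which takes a flattened KV-cache layout and an unpadded query. A shape-only sketch of those two preparations with made-up sizes (plain CPU tensors, no NPU op invoked):

```python
import torch

# Hypothetical paged KV cache: (num_block, block_size, num_kv_heads, head_dim)
num_block, block_size, num_kv_heads, head_dim = 8, 128, 4, 128
key_cache = torch.randn(num_block, block_size, num_kv_heads, head_dim)

# 1. Flatten the per-head dims so the fused op sees (num_block, block_size, H * D),
#    mirroring `self.key_cache.view(num_block, block_size, -1)` in the diff.
key = key_cache.view(num_block, block_size, -1)
assert key.shape == (num_block, block_size, num_kv_heads * head_dim)

# 2. Unpad the query: query_start_loc[-1] is the total number of real tokens,
#    and the fused op requires query.shape[0] to match it.
query = torch.randn(96, num_kv_heads * head_dim)  # padded to 96 rows (made-up size)
query_start_loc = torch.tensor([0, 17, 50, 90])   # hypothetical cumulative query lengths
num_tokens = query_start_loc[-1]
query = query[:num_tokens]                        # keep the 90 real tokens
```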