Upgrade CANN to 8.3.rc1 (#3945)

### What this PR does / why we need it?
This PR upgrades CANN from 8.2.RC1 to 8.3.RC1 and removes the CANN version
check logic.
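
Concretely, the updated unit tests below no longer patch `torch.version` or set `mock_version.cann = "8.4.RC1"`, which suggests the backend previously branched on the CANN version when picking an attention kernel. A minimal sketch of the kind of gate that goes away, assuming the check compared `torch.version.cann` against a threshold (the helper name, threshold, and fallback call are illustrative, not the real vllm_ascend code):

```python
# Hypothetical sketch of the removed CANN version gate; _cann_at_least and the
# legacy fallback call shape are illustrative, not the actual implementation.
import torch
import torch_npu


def _cann_at_least(major: int, minor: int) -> bool:
    # torch_npu builds expose the toolkit version as a string like "8.2.RC1".
    cann = getattr(torch.version, "cann", "") or ""
    parts = cann.split(".")
    try:
        return (int(parts[0]), int(parts[1])) >= (major, minor)
    except (IndexError, ValueError):
        return False


def run_attention(query, key, value, **kwargs):
    if _cann_at_least(8, 2):
        # Newer CANN: single fused kernel; the first element of the returned
        # tuple is the attention output (matching the mocked (output, 1) below).
        output, _ = torch_npu.npu_fused_infer_attention_score(
            query, key, value, **kwargs)
        return output
    # Older CANN: legacy split kernel path (argument shape is schematic).
    return torch_npu._npu_paged_attention_splitfuse(query, key, value, **kwargs)
```

With 8.3.RC1 as the minimum supported CANN, the branch is unnecessary and the fused kernel is used unconditionally.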

TODO: we noticed that UT runs fail with the CANN 8.3 image, so the base
image for UT is still 8.2. We'll fix this later.


- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Authored by wangxiyuan on 2025-11-03 20:21:07 +08:00, committed by GitHub
parent 49d74785c4
commit cc2cd42ad3
39 changed files with 119 additions and 213 deletions
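
Most of the changes below follow one pattern: the unit tests stop mocking the legacy `_npu_flash_attention_qlens` / `_npu_paged_attention_splitfuse` kernels (and drop the `torch.version` patch that supported the version check) and instead mock `torch_npu.npu_fused_infer_attention_score`, whose return value is a tuple with the attention output as its first element. A condensed, self-contained sketch of that pattern, using a stand-in implementation class rather than the real vllm_ascend fixtures (it needs a torch_npu install to run):

```python
# Condensed sketch of the updated mocking pattern. AttentionImplStub stands in
# for the real Ascend attention backend; only the mock/assert flow mirrors the
# actual tests changed in this diff.
from unittest.mock import patch

import torch


class AttentionImplStub:
    """Stand-in for vllm_ascend's attention backend implementation."""

    def forward(self, query, key, value):
        import torch_npu

        # New code path: one fused kernel call instead of the legacy
        # flash_attention_qlens / paged_attention_splitfuse kernels.
        output, _ = torch_npu.npu_fused_infer_attention_score(query, key, value)
        return output


@patch('torch_npu.npu_fused_infer_attention_score')
def test_forward_uses_fused_kernel(mock_fused_score):
    query = torch.randn(10, 8 * 64)
    key = torch.randn(10, 8 * 64)
    value = torch.randn(10, 8 * 64)
    expected = torch.empty_like(query)
    # The fused kernel returns a tuple; the tests only consume element 0.
    mock_fused_score.return_value = (expected, 1)

    output = AttentionImplStub().forward(query, key, value)

    mock_fused_score.assert_called_once()
    assert output.shape == (10, 8 * 64)
```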


@@ -298,8 +298,9 @@ class TestAscendAttentionBackendImpl(TestBase):
         assert output.shape == (10, 8 * 64)
 
     @patch('torch_npu._npu_reshape_and_cache')
-    @patch('torch_npu._npu_flash_attention_qlens')
-    def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens,
+    @patch('torch_npu.npu_fused_infer_attention_score')
+    def test_forward_prefill_cache_hit(self,
+                                       mock_npu_fused_infer_attention_score,
                                        mock_npu_reshape_and_cache):
         """Test forward pass in PrefillCacheHit state"""
         query = torch.randn(10, 8 * 64)
@@ -308,6 +309,8 @@ class TestAscendAttentionBackendImpl(TestBase):
         kv_cache = torch.empty(2, 5, 128, 8, 64)
         output = torch.empty_like(query)
 
+        mock_npu_fused_infer_attention_score.return_value = (output, 1)
+
         metadata = self.attn_metadata
         metadata.attn_state = AscendAttentionState.PrefillCacheHit
         metadata.attn_mask = torch.randn(1, 1, 10, 10)
@@ -323,7 +326,7 @@ class TestAscendAttentionBackendImpl(TestBase):
         output = self.impl.forward(layer, query, key, value, kv_cache,
                                    metadata, output)
 
-        mock_flash_attention_qlens.assert_called_once()
+        mock_npu_fused_infer_attention_score.assert_called_once()
         assert output.shape == (10, 8 * 64)
 
     @patch('vllm_ascend.attention.attention_v1.get_forward_context')
@@ -528,13 +531,11 @@ class TestAscendAttentionBackendImpl(TestBase):
         assert output.shape == (10, 8 * 64)
 
-    @patch('torch.version')
     @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
     @patch('torch_npu._npu_reshape_and_cache')
     @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')
     def test_forward_head_size_192(self, mock_vanilla_prefill,
-                                   mock_npu_reshape_and_cache, mock_is_310p,
-                                   mock_version):
+                                   mock_npu_reshape_and_cache, mock_is_310p):
         """Test forward pass when head_size is 192"""
         self.impl.head_size = 192
@@ -554,7 +555,6 @@ class TestAscendAttentionBackendImpl(TestBase):
         metadata.num_decodes = 10
         metadata.num_prefills = 0
         layer = self.layer_no_quant
-        mock_version.cann = "8.4.RC1"
         mock_vanilla_prefill.return_value = MagicMock()
         output = self.impl_192.forward(layer, query, key, value, kv_cache,
@@ -563,12 +563,11 @@ class TestAscendAttentionBackendImpl(TestBase):
         mock_vanilla_prefill.assert_called_once()
         assert output.shape == (10, 8 * 192)
 
-    @patch('torch.version')
     @patch('torch_npu._npu_reshape_and_cache')
-    @patch('torch_npu._npu_paged_attention_splitfuse')
-    def test_forward_normal_v1_situation(self, mock_paged_attention,
-                                         mock_npu_reshape_and_cache,
-                                         mock_version):
+    @patch('torch_npu.npu_fused_infer_attention_score')
+    def test_forward_normal_v1_situation(self,
+                                         mock_npu_fused_infer_attention_score,
+                                         mock_npu_reshape_and_cache):
         """Test forward pass in normal V1 situation"""
         query = torch.randn(10, 8 * 64)
         key = torch.randn(10, 8 * 64)
@@ -576,6 +575,8 @@ class TestAscendAttentionBackendImpl(TestBase):
         kv_cache = torch.empty(2, 5, 128, 8, 64)
         output = torch.empty_like(query)
 
+        mock_npu_fused_infer_attention_score.return_value = (output, 1)
+
         metadata = self.attn_metadata
         metadata.attn_mask = torch.randn(1, 1, 10, 10)
         metadata.query_lens = torch.tensor([10])
@@ -587,22 +588,20 @@ class TestAscendAttentionBackendImpl(TestBase):
         metadata.num_prefills = 10
         layer = self.layer_no_quant
-        mock_version.cann = "8.4.RC1"
         output = self.impl.forward(layer, query, key, value, kv_cache,
                                    metadata, output)
 
-        mock_paged_attention.assert_called_once()
+        mock_npu_fused_infer_attention_score.assert_called_once()
         assert output.shape == (10, 8 * 64)
 
-    @patch('torch.version')
     @patch('torch_npu.npu_format_cast')
     @patch('torch_npu._npu_reshape_and_cache')
-    @patch('torch_npu._npu_paged_attention_splitfuse')
+    @patch('torch_npu.npu_fused_infer_attention_score')
     @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
-    def test_forward_310p_device(self, mock_is_310p, mock_paged_attention,
+    def test_forward_310p_device(self, mock_is_310p,
+                                 mock_npu_fused_infer_attention_score,
                                  mock_npu_reshape_and_cache,
-                                 mock_npu_format_cast, mock_version):
+                                 mock_npu_format_cast):
         """Test forward pass on 310P device"""
         query = torch.randn(10, 8 * 64)
         key = torch.randn(10, 8 * 64)
@@ -610,6 +609,8 @@ class TestAscendAttentionBackendImpl(TestBase):
         kv_cache = torch.empty(2, 5, 128, 8, 64)
         output = torch.empty_like(query)
 
+        mock_npu_fused_infer_attention_score.return_value = (output, 1)
+
         metadata = self.attn_metadata
         metadata.attn_mask = torch.randn(1, 1, 10, 10)
         metadata.query_lens = torch.tensor([10])
@@ -622,12 +623,11 @@ class TestAscendAttentionBackendImpl(TestBase):
         layer = self.layer_no_quant
         mock_npu_format_cast.return_value = metadata.attn_mask
-        mock_version.cann = "8.4.RC1"
         output = self.impl.forward(layer, query, key, value, kv_cache,
                                    metadata, output)
 
-        mock_paged_attention.assert_called_once()
+        mock_npu_fused_infer_attention_score.assert_called_once()
         assert output.shape == (10, 8 * 64)
 
     @patch('torch_npu._npu_reshape_and_cache')