[cherry-pick]Upgrade CANN to 8.3.rc1 (#3945) (#3962)

This PR upgrades CANN from 8.2.rc1 to 8.3.rc1 and removes the CANN version
check logic.

TODO: we noticed that UT runs fail with the CANN 8.3 image, so the base
image for UT remains 8.2. We'll fix it later.

- vLLM version: v0.11.0
- vLLM main:
83f478bb19

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-11-06 09:05:08 +08:00
committed by GitHub
parent 66b67f9cf2
commit 7ee0b0b5d8
36 changed files with 104 additions and 192 deletions

View File

@@ -15,7 +15,7 @@ spec:
spec:
containers:
- name: vllm-leader
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
env:
- name: WORKSPACE
value: "/root/workspace"
@@ -70,7 +70,7 @@ spec:
spec:
containers:
- name: vllm-worker
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
env:
- name: WORKSPACE
value: "/root/workspace"

View File

@@ -1,2 +1,2 @@
# Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository
BASE_IMAGE_NAME="quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
BASE_IMAGE_NAME="quay.io/ascend/cann:8.3.rc1-910b-ubuntu22.04-py3.11"

View File

@@ -91,43 +91,5 @@ class TestAttentionMaskBuilder(TestBase):
dtype=torch.float16,
device=torch.device("cpu"),
)
self.assertEqual(attn_mask.shape, (6, 100))
self.assertEqual(attn_mask.shape, (2048, 2048))
self.assertEqual(attention_mask_builder._seq_len_cached, 1024)
attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
seq_lens=torch.tensor([10, 3000, 2000]),
position=torch.tensor([7, 8, 9, 2999, 1999]),
dtype=torch.float16,
device=torch.device("cpu"),
)
self.assertEqual(attn_mask.shape, (5, 3000))
self.assertEqual(attention_mask_builder._seq_len_cached, 3000)
# splitfuse_attn_mask now only supports data types: torch.float16 and torch.bfloat16
# otherwise raise ValueError
with self.assertRaises(ValueError):
attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
seq_lens=torch.tensor([10, 20, 100]),
position=torch.tensor([7, 8, 9, 18, 19, 99]),
dtype=torch.int8,
device=torch.device("cpu"),
)
def test_mask_value_cleanliness(self):
attention_mask_builder = AttentionMaskBuilder(max_seq_len=6,
dtype=torch.bfloat16)
self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1],
torch.tensor(1, dtype=torch.bfloat16))
attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
seq_lens=torch.tensor([6]),
position=torch.tensor([3, 4, 5]),
dtype=torch.bfloat16,
device=torch.device("cpu"),
)
self.assertEqual(
attn_mask[-2][-1],
torch.tensor(-10000, dtype=torch.bfloat16,
device=attn_mask.device))
self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1],
torch.tensor(1, dtype=torch.bfloat16))

View File

@@ -344,8 +344,9 @@ class TestAscendAttentionBackendImpl(TestBase):
assert output.shape == (10, 8 * 64)
@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu._npu_flash_attention_qlens')
def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens,
@patch('torch_npu.npu_fused_infer_attention_score')
def test_forward_prefill_cache_hit(self,
mock_npu_fused_infer_attention_score,
mock_npu_reshape_and_cache):
"""Test forward pass in PrefillCacheHit state"""
query = torch.randn(10, 8 * 64)
@@ -370,7 +371,7 @@ class TestAscendAttentionBackendImpl(TestBase):
metadata,
trace_flag=False)
mock_flash_attention_qlens.assert_called_once()
mock_npu_fused_infer_attention_score.assert_called_once()
assert output.shape == (10, 8 * 64)
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@@ -613,8 +614,9 @@ class TestAscendAttentionBackendImpl(TestBase):
assert output.shape == (10, 8 * 192)
@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu._npu_paged_attention_splitfuse')
def test_forward_normal_v1_situation(self, mock_paged_attention,
@patch('torch_npu.npu_fused_infer_attention_score')
def test_forward_normal_v1_situation(self,
mock_npu_fused_infer_attention_score,
mock_npu_reshape_and_cache):
"""Test forward pass in normal V1 situation"""
query = torch.randn(10, 8 * 64)
@@ -638,14 +640,15 @@ class TestAscendAttentionBackendImpl(TestBase):
metadata,
trace_flag=False)
mock_paged_attention.assert_called_once()
mock_npu_fused_infer_attention_score.assert_called_once()
assert output.shape == (10, 8 * 64)
@patch('torch_npu.npu_format_cast')
@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu._npu_paged_attention_splitfuse')
@patch('torch_npu.npu_fused_infer_attention_score')
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
def test_forward_310p_device(self, mock_is_310p, mock_paged_attention,
def test_forward_310p_device(self, mock_is_310p,
mock_npu_fused_infer_attention_score,
mock_npu_reshape_and_cache,
mock_npu_format_cast):
"""Test forward pass on 310P device"""
@@ -671,7 +674,7 @@ class TestAscendAttentionBackendImpl(TestBase):
metadata,
trace_flag=False)
mock_paged_attention.assert_called_once()
mock_npu_fused_infer_attention_score.assert_called_once()
assert output.shape == (10, 8 * 64)
@patch('torch_npu._npu_reshape_and_cache')

View File

@@ -63,33 +63,20 @@ class TestAscendUnquantizedLinearMethod(TestBase):
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
@mock.patch("torch_npu.npu_format_cast")
@mock.patch("torch.version")
def test_process_weights_after_loading_is_8_3_enable_nz(
self, mock_version, mock_format_cast, mock_is_nz):
mock_version.cann = "8.3.RC1"
def test_process_weights_after_loading_enable_nz(self, mock_format_cast,
mock_is_nz):
mock_is_nz.return_value = 1
self.method.process_weights_after_loading(self.layer)
mock_format_cast.assert_called_once()
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
@mock.patch("torch_npu.npu_format_cast")
@mock.patch("torch.version")
def test_process_weights_after_loading_is_8_3_disable_nz(
self, mock_version, mock_format_cast, mock_is_nz):
mock_version.cann = "8.3.RC1"
def test_process_weights_after_loading_disable_nz(self, mock_format_cast,
mock_is_nz):
mock_is_nz.return_value = 0
self.method.process_weights_after_loading(self.layer)
mock_format_cast.assert_not_called()
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
@mock.patch("torch.version")
def test_process_weights_after_loading_not_8_3(self, mock_version,
mock_is_nz):
mock_version.cann = "8.2.RC1"
mock_is_nz.return_value = 1
# Should not raise exception
self.method.process_weights_after_loading(self.layer)
class TestAscendRowParallelLinear(BaseLinearTest):