Upgrade CANN to 8.3.rc1 (#3945)
### What this PR does / why we need it?
This PR upgrades CANN from 8.2.rc1 to 8.3.rc1 and removes the CANN version
check logic.
TODO: UT runs currently fail with the CANN 8.3 image, so the base image for UT
stays on 8.2. We'll fix it later.
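For reference, the check being removed follows the common pattern of gating kernel selection on `torch.version.cann` (the attribute the deleted test mocks set, e.g. `mock_version.cann = "8.4.RC1"`). The sketch below only illustrates that pattern; the `_cann_at_least` helper and the `attention_forward` dispatcher are hypothetical, not the actual vllm-ascend code:

```python
# Hedged sketch of the kind of CANN version gate this PR removes.
# torch.version.cann is the string the old tests mocked (e.g. "8.4.RC1");
# _cann_at_least and attention_forward are illustrative names, not real code.
import torch
import torch_npu


def _cann_at_least(version: str, target: str = "8.3") -> bool:
    """Compare the leading numeric fields of CANN version strings like '8.3.RC1'."""

    def numeric_prefix(v: str) -> tuple:
        fields = []
        for piece in v.split("."):
            if not piece.isdigit():
                break
            fields.append(int(piece))
        return tuple(fields)

    return numeric_prefix(version) >= numeric_prefix(target)


def attention_forward(query, key, value, **kwargs):
    """Illustrative dispatch: the removed logic picked a kernel per CANN version."""
    if _cann_at_least(getattr(torch.version, "cann", "0")):
        # The fused kernel returns an (output, lse) tuple, which is why the
        # updated tests set mock_npu_fused_infer_attention_score.return_value
        # to (output, 1).
        attn_output, _ = torch_npu.npu_fused_infer_attention_score(
            query, key, value, **kwargs)
        return attn_output
    # The pre-8.3 fallback went through legacy ops such as
    # torch_npu._npu_paged_attention_splitfuse; with 8.3 as the minimum this
    # branch is dead, so the PR deletes the whole check.
    raise NotImplementedError("legacy CANN < 8.3 path removed in this sketch")
```

With 8.3 as the baseline, tests no longer patch `torch.version`; behavior that still varies (such as the NZ weight format) is keyed off helpers like `is_enable_nz` instead, as the linear-layer test changes below show.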
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
@@ -74,11 +74,10 @@ class TestAttentionMaskBuilder(TestBase):
        attn_mask = attention_mask_builder.get_attn_mask(
            max_seq_len=2048, dtype=torch.float16, device=torch.device("cpu"))
        self.assertEqual(attn_mask.shape, (2048, 2048))
        self.assertEqual(attn_mask[0][-1],
                         torch.tensor(float("-inf"), dtype=torch.float16))
        self.assertEqual(attention_mask_builder._seq_len_cached, 2048)
        self.assertEqual(attn_mask[0][-1], torch.tensor(True))
        self.assertEqual(attention_mask_builder._seq_len_cached, 1024)
        self.assertEqual(attention_mask_builder.attn_mask_cache.shape,
                         (2048, 2048))
                         (1024, 1024))
        self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1],
                         torch.tensor(float("-inf"), dtype=torch.float16))

@@ -91,43 +90,5 @@ class TestAttentionMaskBuilder(TestBase):
            dtype=torch.float16,
            device=torch.device("cpu"),
        )
        self.assertEqual(attn_mask.shape, (6, 100))
        self.assertEqual(attn_mask.shape, (2048, 2048))
        self.assertEqual(attention_mask_builder._seq_len_cached, 1024)

        attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
            seq_lens=torch.tensor([10, 3000, 2000]),
            position=torch.tensor([7, 8, 9, 2999, 1999]),
            dtype=torch.float16,
            device=torch.device("cpu"),
        )
        self.assertEqual(attn_mask.shape, (5, 3000))
        self.assertEqual(attention_mask_builder._seq_len_cached, 3000)

        # splitfuse_attn_mask now only supports data types: torch.float16 and torch.bfloat16
        # otherwise raise ValueError
        with self.assertRaises(ValueError):
            attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
                seq_lens=torch.tensor([10, 20, 100]),
                position=torch.tensor([7, 8, 9, 18, 19, 99]),
                dtype=torch.int8,
                device=torch.device("cpu"),
            )

    def test_mask_value_cleanliness(self):
        attention_mask_builder = AttentionMaskBuilder(max_seq_len=6,
                                                      dtype=torch.bfloat16)
        self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1],
                         torch.tensor(1, dtype=torch.bfloat16))

        attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
            seq_lens=torch.tensor([6]),
            position=torch.tensor([3, 4, 5]),
            dtype=torch.bfloat16,
            device=torch.device("cpu"),
        )
        self.assertEqual(
            attn_mask[-2][-1],
            torch.tensor(-10000, dtype=torch.bfloat16,
                         device=attn_mask.device))
        self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1],
                         torch.tensor(1, dtype=torch.bfloat16))

@@ -298,8 +298,9 @@ class TestAscendAttentionBackendImpl(TestBase):
        assert output.shape == (10, 8 * 64)

    @patch('torch_npu._npu_reshape_and_cache')
    @patch('torch_npu._npu_flash_attention_qlens')
    def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens,
    @patch('torch_npu.npu_fused_infer_attention_score')
    def test_forward_prefill_cache_hit(self,
                                       mock_npu_fused_infer_attention_score,
                                       mock_npu_reshape_and_cache):
        """Test forward pass in PrefillCacheHit state"""
        query = torch.randn(10, 8 * 64)
@@ -308,6 +309,8 @@ class TestAscendAttentionBackendImpl(TestBase):
        kv_cache = torch.empty(2, 5, 128, 8, 64)
        output = torch.empty_like(query)

        mock_npu_fused_infer_attention_score.return_value = (output, 1)

        metadata = self.attn_metadata
        metadata.attn_state = AscendAttentionState.PrefillCacheHit
        metadata.attn_mask = torch.randn(1, 1, 10, 10)
@@ -323,7 +326,7 @@ class TestAscendAttentionBackendImpl(TestBase):
        output = self.impl.forward(layer, query, key, value, kv_cache,
                                   metadata, output)

        mock_flash_attention_qlens.assert_called_once()
        mock_npu_fused_infer_attention_score.assert_called_once()
        assert output.shape == (10, 8 * 64)

    @patch('vllm_ascend.attention.attention_v1.get_forward_context')
@@ -528,13 +531,11 @@ class TestAscendAttentionBackendImpl(TestBase):
        assert output.shape == (10, 8 * 64)

    @patch('torch.version')
    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
    @patch('torch_npu._npu_reshape_and_cache')
    @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')
    def test_forward_head_size_192(self, mock_vanilla_prefill,
                                   mock_npu_reshape_and_cache, mock_is_310p,
                                   mock_version):
                                   mock_npu_reshape_and_cache, mock_is_310p):
        """Test forward pass when head_size is 192"""

        self.impl.head_size = 192
@@ -554,7 +555,6 @@ class TestAscendAttentionBackendImpl(TestBase):
        metadata.num_decodes = 10
        metadata.num_prefills = 0
        layer = self.layer_no_quant
        mock_version.cann = "8.4.RC1"
        mock_vanilla_prefill.return_value = MagicMock()

        output = self.impl_192.forward(layer, query, key, value, kv_cache,
@@ -563,12 +563,11 @@ class TestAscendAttentionBackendImpl(TestBase):
        mock_vanilla_prefill.assert_called_once()
        assert output.shape == (10, 8 * 192)

    @patch('torch.version')
    @patch('torch_npu._npu_reshape_and_cache')
    @patch('torch_npu._npu_paged_attention_splitfuse')
    def test_forward_normal_v1_situation(self, mock_paged_attention,
                                         mock_npu_reshape_and_cache,
                                         mock_version):
    @patch('torch_npu.npu_fused_infer_attention_score')
    def test_forward_normal_v1_situation(self,
                                         mock_npu_fused_infer_attention_score,
                                         mock_npu_reshape_and_cache):
        """Test forward pass in normal V1 situation"""
        query = torch.randn(10, 8 * 64)
        key = torch.randn(10, 8 * 64)
@@ -576,6 +575,8 @@ class TestAscendAttentionBackendImpl(TestBase):
        kv_cache = torch.empty(2, 5, 128, 8, 64)
        output = torch.empty_like(query)

        mock_npu_fused_infer_attention_score.return_value = (output, 1)

        metadata = self.attn_metadata
        metadata.attn_mask = torch.randn(1, 1, 10, 10)
        metadata.query_lens = torch.tensor([10])
@@ -587,22 +588,20 @@ class TestAscendAttentionBackendImpl(TestBase):
        metadata.num_prefills = 10
        layer = self.layer_no_quant

        mock_version.cann = "8.4.RC1"

        output = self.impl.forward(layer, query, key, value, kv_cache,
                                   metadata, output)

        mock_paged_attention.assert_called_once()
        mock_npu_fused_infer_attention_score.assert_called_once()
        assert output.shape == (10, 8 * 64)

    @patch('torch.version')
    @patch('torch_npu.npu_format_cast')
    @patch('torch_npu._npu_reshape_and_cache')
    @patch('torch_npu._npu_paged_attention_splitfuse')
    @patch('torch_npu.npu_fused_infer_attention_score')
    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
    def test_forward_310p_device(self, mock_is_310p, mock_paged_attention,
    def test_forward_310p_device(self, mock_is_310p,
                                 mock_npu_fused_infer_attention_score,
                                 mock_npu_reshape_and_cache,
                                 mock_npu_format_cast, mock_version):
                                 mock_npu_format_cast):
        """Test forward pass on 310P device"""
        query = torch.randn(10, 8 * 64)
        key = torch.randn(10, 8 * 64)
@@ -610,6 +609,8 @@ class TestAscendAttentionBackendImpl(TestBase):
        kv_cache = torch.empty(2, 5, 128, 8, 64)
        output = torch.empty_like(query)

        mock_npu_fused_infer_attention_score.return_value = (output, 1)

        metadata = self.attn_metadata
        metadata.attn_mask = torch.randn(1, 1, 10, 10)
        metadata.query_lens = torch.tensor([10])
@@ -622,12 +623,11 @@ class TestAscendAttentionBackendImpl(TestBase):
        layer = self.layer_no_quant

        mock_npu_format_cast.return_value = metadata.attn_mask
        mock_version.cann = "8.4.RC1"

        output = self.impl.forward(layer, query, key, value, kv_cache,
                                   metadata, output)

        mock_paged_attention.assert_called_once()
        mock_npu_fused_infer_attention_score.assert_called_once()
        assert output.shape == (10, 8 * 64)

    @patch('torch_npu._npu_reshape_and_cache')

@@ -63,33 +63,20 @@ class TestAscendUnquantizedLinearMethod(TestBase):

    @mock.patch("vllm_ascend.ops.linear.is_enable_nz")
    @mock.patch("torch_npu.npu_format_cast")
    @mock.patch("torch.version")
    def test_process_weights_after_loading_is_8_3_enable_nz(
            self, mock_version, mock_format_cast, mock_is_nz):
        mock_version.cann = "8.3.RC1"
    def test_process_weights_after_loading_enable_nz(self, mock_format_cast,
                                                     mock_is_nz):
        mock_is_nz.return_value = 1
        self.method.process_weights_after_loading(self.layer)
        mock_format_cast.assert_called_once()

    @mock.patch("vllm_ascend.ops.linear.is_enable_nz")
    @mock.patch("torch_npu.npu_format_cast")
    @mock.patch("torch.version")
    def test_process_weights_after_loading_is_8_3_disable_nz(
            self, mock_version, mock_format_cast, mock_is_nz):
        mock_version.cann = "8.3.RC1"
    def test_process_weights_after_loading_disable_nz(self, mock_format_cast,
                                                      mock_is_nz):
        mock_is_nz.return_value = 0
        self.method.process_weights_after_loading(self.layer)
        mock_format_cast.assert_not_called()

    @mock.patch("vllm_ascend.ops.linear.is_enable_nz")
    @mock.patch("torch.version")
    def test_process_weights_after_loading_not_8_3(self, mock_version,
                                                   mock_is_nz):
        mock_version.cann = "8.2.RC1"
        mock_is_nz.return_value = 1
        # Should not raise exception
        self.method.process_weights_after_loading(self.layer)


class TestAscendRowParallelLinear(BaseLinearTest):