Fix some ci issue and refactor modelrunner (#2445)
### What this PR does / why we need it?
Fix some CI issues and refactor the model runner.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with the existing tests.
- vLLM version: v0.10.0
- vLLM main: 4d9c61993a
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
This commit is contained in:
@@ -9,6 +9,7 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
|
||||
AscendAttentionState,
|
||||
AscendMetadata,
|
||||
CommonAttentionState)
|
||||
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
|
||||
|
||||
class TestAscendAttentionBackend(TestBase):
|
||||
@@ -67,8 +68,12 @@ class TestAscendAttentionBackend(TestBase):
|
||||
class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
|
||||
def setUp(self):
|
||||
self.mock_runner = MagicMock()
|
||||
self.builder = AscendAttentionMetadataBuilder(self.mock_runner)
|
||||
self.mock_vllm_config = MagicMock()
|
||||
self.mock_vllm_config.model_config.max_model_len = 640
|
||||
self.mock_vllm_config.cache_config.block_size = 64
|
||||
self.mock_device = 'cpu:0'
|
||||
self.builder = AscendAttentionMetadataBuilder(self.mock_vllm_config,
|
||||
self.mock_device)
|
||||
|
||||
def test_reorder_batch(self):
|
||||
mock_input_batch = MagicMock()
|
||||
@@ -86,31 +91,28 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d,
|
||||
mock_npu_format_cast,
|
||||
mock_ascend_metadata):
|
||||
num_reqs = 2
|
||||
num_actual_tokens = 10
|
||||
max_query_len = 5
|
||||
|
||||
self.mock_runner.input_batch.block_table = [MagicMock()]
|
||||
self.mock_runner.input_batch.block_table[
|
||||
0].get_device_tensor.return_value = torch.zeros((10, 10))
|
||||
self.mock_runner.max_num_blocks_per_req = 10
|
||||
self.mock_runner.query_lens = torch.tensor([3, 4])
|
||||
self.mock_runner.seq_lens_cpu = torch.tensor([5, 6])
|
||||
self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
|
||||
self.mock_runner.device = 'cpu:0'
|
||||
self.mock_runner.attn_mask = torch.ones((10, 10))
|
||||
self.mock_runner.attn_state = AscendAttentionState.PrefillNoCache
|
||||
self.mock_runner.query_start_loc_cpu = torch.tensor([0, 3, 7])
|
||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||
query_start_loc=torch.tensor([0, 3, 7]),
|
||||
query_start_loc_cpu=torch.tensor([0, 3, 7]),
|
||||
seq_lens_cpu=torch.tensor([5, 6]),
|
||||
num_reqs=2,
|
||||
num_actual_tokens=10,
|
||||
max_query_len=5,
|
||||
decode_token_per_req=torch.tensor([1, 1]),
|
||||
block_table_tensor=torch.zeros((10, 10)),
|
||||
slot_mapping_cpu=torch.tensor(range(20)),
|
||||
actual_seq_lengths_q=torch.tensor([0, 1]),
|
||||
positions=torch.tensor([10, 10]),
|
||||
attn_mask=torch.ones((10, 10)),
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.PrefillNoCache)
|
||||
|
||||
mock_nz_tensor = MagicMock()
|
||||
mock_model = MagicMock()
|
||||
mock_nd_to_nz_2d.return_value = mock_nz_tensor
|
||||
mock_npu_format_cast.return_value = mock_nz_tensor
|
||||
|
||||
self.builder.build(
|
||||
num_reqs,
|
||||
num_actual_tokens,
|
||||
max_query_len,
|
||||
)
|
||||
self.builder.build(common_attn_metadata, mock_model)
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
|
||||
@patch('torch_npu.npu_format_cast')
|
||||
@@ -120,51 +122,53 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
def test_build_chunked_prefill(self, mock_ascend_attention_state,
|
||||
mock_is_310p, mock_nd_to_nz_spec,
|
||||
mock_npu_format_cast, mock_ascend_metadata):
|
||||
num_reqs = 3
|
||||
num_actual_tokens = 15
|
||||
max_query_len = 6
|
||||
|
||||
self.mock_runner.input_batch.block_table = [MagicMock()]
|
||||
self.mock_runner.input_batch.block_table[
|
||||
0].get_device_tensor.return_value = torch.zeros((10, 10))
|
||||
self.mock_runner.max_num_blocks_per_req = 10
|
||||
self.mock_runner.query_lens = torch.tensor([2, 3, 4])
|
||||
self.mock_runner.seq_lens_cpu = torch.tensor([4, 5, 6])
|
||||
self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
|
||||
self.mock_runner.device = 'cpu:0'
|
||||
self.mock_runner.attn_mask = torch.ones((15, 15))
|
||||
self.mock_runner.attn_state = AscendAttentionState.ChunkedPrefill
|
||||
self.mock_runner.query_start_loc_cpu = torch.tensor([0, 2, 5, 9])
|
||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||
query_start_loc=torch.tensor([0, 2, 5, 9]),
|
||||
query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
|
||||
seq_lens_cpu=torch.tensor([4, 5, 6]),
|
||||
num_reqs=3,
|
||||
num_actual_tokens=15,
|
||||
max_query_len=6,
|
||||
decode_token_per_req=torch.tensor([1, 1, 1]),
|
||||
block_table_tensor=torch.zeros((10, 10)),
|
||||
slot_mapping_cpu=torch.tensor(range(20)),
|
||||
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
|
||||
positions=torch.tensor([10, 10]),
|
||||
attn_mask=torch.ones((15, 15)),
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.ChunkedPrefill)
|
||||
|
||||
mock_ascend_attention_state = MagicMock()
|
||||
mock_ascend_attention_state.PrefillNoCache = 0
|
||||
|
||||
mock_nz_tensor = MagicMock()
|
||||
mock_model = MagicMock()
|
||||
mock_nd_to_nz_spec.return_value = mock_nz_tensor
|
||||
mock_npu_format_cast.return_value = mock_nz_tensor
|
||||
|
||||
self.builder.build(num_reqs, num_actual_tokens, max_query_len)
|
||||
self.builder.build(common_attn_metadata, mock_model)
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
|
||||
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
|
||||
def test_build_non_310p(self, mock_is_310p, mock_ascend_metadata):
|
||||
num_reqs = 3
|
||||
num_actual_tokens = 15
|
||||
max_query_len = 6
|
||||
common_attn_metadata = AscendCommonAttentionMetadata(
|
||||
query_start_loc=torch.tensor([0, 2, 5, 9]),
|
||||
query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
|
||||
seq_lens_cpu=torch.tensor([4, 5, 6]),
|
||||
num_reqs=3,
|
||||
num_actual_tokens=15,
|
||||
max_query_len=6,
|
||||
decode_token_per_req=torch.tensor([1, 1, 1]),
|
||||
block_table_tensor=torch.zeros((10, 10)),
|
||||
slot_mapping_cpu=torch.tensor(range(20)),
|
||||
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
|
||||
positions=torch.tensor([10, 10]),
|
||||
attn_mask=torch.ones((15, 15)),
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.ChunkedPrefill)
|
||||
mock_model = MagicMock()
|
||||
|
||||
self.mock_runner.input_batch.block_table = [MagicMock()]
|
||||
self.mock_runner.input_batch.block_table[
|
||||
0].get_device_tensor.return_value = torch.zeros((10, 10))
|
||||
self.mock_runner.max_num_blocks_per_req = 10
|
||||
self.mock_runner.query_lens = torch.tensor([2, 3, 4])
|
||||
self.mock_runner.seq_lens_cpu = torch.tensor([4, 5, 6])
|
||||
self.mock_runner.slot_mapping_cpu = torch.tensor(range(20))
|
||||
self.mock_runner.device = 'cpu:0'
|
||||
self.mock_runner.attn_mask = torch.ones((15, 15))
|
||||
self.mock_runner.attn_state = AscendAttentionState.ChunkedPrefill
|
||||
self.mock_runner.query_start_loc_cpu = torch.tensor([0, 2, 5, 9])
|
||||
|
||||
self.builder.build(num_reqs, num_actual_tokens, max_query_len)
|
||||
self.builder.build(common_attn_metadata, mock_model)
|
||||
|
||||
|
||||
class TestAscendAttentionBackendImpl(TestBase):
|
||||
|
||||
Reference in New Issue
Block a user