We'll release 0.13.0 soon, so the main branch is frozen. Let's revert the
newest change and redo it once 0.13.0 is released.
- vLLM version: release/v0.13.0
- vLLM main: 81786c8774
This commit is contained in:
@@ -865,7 +865,7 @@ class TestAscendMLAImpl(TestBase):
|
||||
q_head_idx, q_tail_idx, kv_with_q_head_nomask_idx, kv_with_q_head_mask_idx, kv_with_q_tail_nomask_idx, \
|
||||
kv_with_q_tail_mask_idx, chunk_seqlens, kv_with_q_head_nomask_seqlens, kv_with_q_tail_nomask_seqlens = get_pcp_split_info(
|
||||
rank, pcp_size, nums_tokens_per_rank)
|
||||
kv_with_q_head_nomask_idx = [kv_with_q_head_nomask_idx]
|
||||
|
||||
output_head, lse_head = self.impl._attention_with_mask_and_nomask(
|
||||
q_nope=torch.index_select(q_nope, 0, q_head_idx),
|
||||
q_pe=torch.index_select(q_pe, 0, q_head_idx),
|
||||
@@ -876,16 +876,15 @@ class TestAscendMLAImpl(TestBase):
|
||||
kv_nomask_idx=kv_with_q_head_nomask_idx,
|
||||
attn_mask_seqlens=torch.tensor(
|
||||
[chunk_seqlens, chunk_seqlens], dtype=torch.int32),
|
||||
attn_nomask_seqlens=[kv_with_q_head_nomask_seqlens],
|
||||
attn_nomask_seqlens=kv_with_q_head_nomask_seqlens,
|
||||
mask=mask)
|
||||
self.assertEqual(output_head.shape,
|
||||
(q_head_idx.shape[0], num_heads, v_head_dim))
|
||||
self.assertEqual(lse_head.shape,
|
||||
(num_heads, q_head_idx.shape[0]))
|
||||
self.assertEqual(mock_npu_ring_mla.call_count,
|
||||
1 + (len(kv_with_q_head_nomask_idx[0]) != 0))
|
||||
1 + (kv_with_q_head_nomask_idx.shape[0] != 0))
|
||||
mock_npu_ring_mla.reset_mock()
|
||||
kv_with_q_tail_nomask_idx = [kv_with_q_tail_nomask_idx]
|
||||
output_tail, lse_tail = self.impl._attention_with_mask_and_nomask(
|
||||
q_nope=torch.index_select(q_nope, 0, q_tail_idx),
|
||||
q_pe=torch.index_select(q_pe, 0, q_tail_idx),
|
||||
@@ -896,7 +895,7 @@ class TestAscendMLAImpl(TestBase):
|
||||
kv_nomask_idx=kv_with_q_tail_nomask_idx,
|
||||
attn_mask_seqlens=torch.tensor(
|
||||
[chunk_seqlens, chunk_seqlens], dtype=torch.int32),
|
||||
attn_nomask_seqlens=[kv_with_q_tail_nomask_seqlens],
|
||||
attn_nomask_seqlens=kv_with_q_tail_nomask_seqlens,
|
||||
mask=mask)
|
||||
|
||||
self.assertEqual(output_tail.shape,
|
||||
@@ -904,7 +903,7 @@ class TestAscendMLAImpl(TestBase):
|
||||
self.assertEqual(lse_tail.shape,
|
||||
(num_heads, q_tail_idx.shape[0]))
|
||||
self.assertEqual(mock_npu_ring_mla.call_count,
|
||||
1 + (len(kv_with_q_tail_nomask_idx[0]) != 0))
|
||||
1 + (kv_with_q_tail_nomask_idx.shape[0] != 0))
|
||||
mock_npu_ring_mla.reset_mock()
|
||||
|
||||
@patch("torch.distributed.all_to_all_single")
|
||||
|
||||
@@ -73,15 +73,6 @@ def test_generate_pcp_metadata_basic(pcp_size, dcp_size, num_reqs, query_lens,
|
||||
|
||||
mock_runner.query_lens = torch.tensor(query_lens)
|
||||
|
||||
mock_runner._get_cp_local_seq_lens.side_effect = NPUModelRunner._get_cp_local_seq_lens.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
mock_runner._list_to_tensor.side_effect = NPUModelRunner._list_to_tensor.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
mock_runner._split_nomask_idx_tensor_list.side_effect = NPUModelRunner._split_nomask_idx_tensor_list.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
mock_runner._split_multi_batch_kv_idx.side_effect = NPUModelRunner._split_multi_batch_kv_idx.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
|
||||
mock_runner._get_cp_local_seq_lens = NPUModelRunner._get_cp_local_seq_lens.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
|
||||
@@ -97,7 +88,9 @@ def test_generate_pcp_metadata_basic(pcp_size, dcp_size, num_reqs, query_lens,
|
||||
mock_runner.q_tail_idx_tensor = None
|
||||
mock_runner.q_full_idx = None
|
||||
|
||||
result = NPUModelRunner._generate_pcp_metadata(mock_runner, total_tokens)
|
||||
method = NPUModelRunner._generate_pcp_metadata.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
result = method(total_tokens)
|
||||
|
||||
if not expect_not_none:
|
||||
assert result is None, f"Expected to return None, but got {type(result)}"
|
||||
@@ -478,201 +471,3 @@ def test_generate_pcp_mtp_input(
|
||||
target_input_ids_pcp_full)
|
||||
assert torch.equal(mock_runner.query_start_loc_pcp_full.cpu[:num_reqs + 1],
|
||||
target_query_start_loc_pcp_full)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"pcp_rank, split_with_q_head_nomask_idx_reqs, split_kv_with_q_tail_nomask_idx_reqs,"
|
||||
"head_attn_nomask_seqlens, chunk_seqlens,"
|
||||
"target_split_q_head, target_split_q_tail, target_head_seqlens, target_tail_seqlens",
|
||||
[
|
||||
# case1: pcp_rank=0
|
||||
(0, [[10, 20, 30]], [[40, 50, 60]],
|
||||
torch.tensor([[64], [0]], dtype=torch.int32), [64], [
|
||||
torch.tensor([1, 2, 3], dtype=torch.int32)
|
||||
], [torch.tensor([40, 50, 60], dtype=torch.int32)], [
|
||||
torch.tensor([[64], [0]], dtype=torch.int32)
|
||||
], [torch.tensor([[64], [3]], dtype=torch.int32)]),
|
||||
# case2: pcp_rank=1
|
||||
(1, [[1, 2], [3, 4, 5]], [[6, 7], [8, 9, 10]],
|
||||
torch.tensor([[128, 128], [128, 128]], dtype=torch.int32), [128, 128],
|
||||
[torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32)], [
|
||||
torch.tensor([6, 7, 8, 9, 10], dtype=torch.int32)
|
||||
], [torch.tensor([[128, 128], [2, 3]], dtype=torch.int32)
|
||||
], [torch.tensor([[128, 128], [2, 3]], dtype=torch.int32)]),
|
||||
# case3: pcp_rank=2
|
||||
(2, [[11, 12, 13, 14], [15, 16]], [[17, 18, 19], [20, 21, 22, 23]],
|
||||
torch.tensor([[256, 256], [512, 512]], dtype=torch.int32), [256, 256],
|
||||
[torch.tensor([11, 12, 13, 14, 15, 16], dtype=torch.int32)], [
|
||||
torch.tensor([17, 18, 19, 20, 21, 22, 23], dtype=torch.int32)
|
||||
], [torch.tensor([[256, 256], [4, 2]], dtype=torch.int32)
|
||||
], [torch.tensor([[256, 256], [3, 4]], dtype=torch.int32)]),
|
||||
# case4: empty input
|
||||
(
|
||||
0,
|
||||
[],
|
||||
[],
|
||||
torch.tensor([], dtype=torch.int32).reshape(2, 0),
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
),
|
||||
# case5: single element input
|
||||
(
|
||||
0,
|
||||
[[10]],
|
||||
[[40]],
|
||||
torch.tensor([[64], [0]], dtype=torch.int32),
|
||||
[64],
|
||||
[torch.tensor([1, 2, 3], dtype=torch.int32)],
|
||||
[torch.tensor([40], dtype=torch.int32)],
|
||||
[torch.tensor([[64], [0]], dtype=torch.int32)],
|
||||
[torch.tensor([[64], [1]], dtype=torch.int32)],
|
||||
),
|
||||
# case6: pcp_rank=3
|
||||
(
|
||||
3,
|
||||
[[1, 2], [3, 4, 5]],
|
||||
[[6, 7], [8, 9, 10]],
|
||||
torch.tensor([[128, 128], [128, 128]], dtype=torch.int32),
|
||||
[128, 128],
|
||||
[torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32)],
|
||||
[torch.tensor([6, 7, 8, 9, 10], dtype=torch.int32)],
|
||||
[torch.tensor([[128, 128], [2, 3]], dtype=torch.int32)],
|
||||
[torch.tensor([[128, 128], [2, 3]], dtype=torch.int32)],
|
||||
),
|
||||
])
|
||||
def test_split_nomask_idx_tensor_list(
|
||||
pcp_rank, split_with_q_head_nomask_idx_reqs,
|
||||
split_kv_with_q_tail_nomask_idx_reqs, head_attn_nomask_seqlens,
|
||||
chunk_seqlens, target_split_q_head, target_split_q_tail,
|
||||
target_head_seqlens, target_tail_seqlens):
|
||||
# Mock input data
|
||||
mock_runner = MagicMock(spec=NPUModelRunner)
|
||||
mock_runner.device = "cpu"
|
||||
mock_runner.pcp_rank = 0
|
||||
mock_runner.kv_idx_names = {
|
||||
"kv_with_q_head_nomask_idx_tensor":
|
||||
torch.tensor([1, 2, 3], dtype=torch.int32)
|
||||
}
|
||||
|
||||
mock_runner.pcp_rank = pcp_rank
|
||||
|
||||
# Mock output
|
||||
mock_runner._split_multi_batch_kv_idx.side_effect = NPUModelRunner._split_multi_batch_kv_idx.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
mock_runner._list_to_tensor.side_effect = NPUModelRunner._list_to_tensor.__get__(
|
||||
mock_runner, NPUModelRunner)
|
||||
|
||||
# Call the method under test
|
||||
result = NPUModelRunner._split_nomask_idx_tensor_list(
|
||||
mock_runner,
|
||||
split_with_q_head_nomask_idx_reqs=split_with_q_head_nomask_idx_reqs,
|
||||
split_kv_with_q_tail_nomask_idx_reqs=
|
||||
split_kv_with_q_tail_nomask_idx_reqs,
|
||||
head_attn_nomask_seqlens=head_attn_nomask_seqlens,
|
||||
chunk_seqlens=chunk_seqlens)
|
||||
split_q_head, split_q_tail, head_seqlens, tail_seqlens = result
|
||||
|
||||
# Assert the method call
|
||||
assert len(split_q_head) == len(target_split_q_head)
|
||||
for res, target in zip(split_q_head, target_split_q_head):
|
||||
assert torch.equal(res, target)
|
||||
|
||||
assert len(split_q_tail) == len(target_split_q_tail)
|
||||
for res, target in zip(split_q_tail, target_split_q_tail):
|
||||
assert torch.equal(res, target)
|
||||
|
||||
assert len(head_seqlens) == len(target_head_seqlens)
|
||||
for res, target in zip(head_seqlens, target_head_seqlens):
|
||||
if isinstance(target, torch.Tensor):
|
||||
assert torch.equal(res, target)
|
||||
else:
|
||||
assert res == target
|
||||
|
||||
assert len(tail_seqlens) == len(target_tail_seqlens)
|
||||
for res, target in zip(tail_seqlens, target_tail_seqlens):
|
||||
if isinstance(target, torch.Tensor):
|
||||
assert torch.equal(res, target)
|
||||
else:
|
||||
assert res == target
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kv_nomask_idx_multi_batch, split_size, expected_merged_idx, expected_merged_len",
|
||||
[
|
||||
# case1: multiple batches + split size greater than batch length
|
||||
(
|
||||
[[0, 1, 2, 3, 4], [5, 6, 7]],
|
||||
2,
|
||||
# expected merged_split_kv_idx_3d
|
||||
[[0, 1, 5, 6], [2, 3, 7], [4]],
|
||||
# expected merged_split_kv_len_2d
|
||||
[[2, 2], [2, 1], [1, 0]],
|
||||
),
|
||||
# case2: single batch + split size greater than batch length
|
||||
(
|
||||
[[0, 1, 2]],
|
||||
5,
|
||||
[[0, 1, 2]],
|
||||
[[3]],
|
||||
),
|
||||
# case3: split size equals maximum batch length
|
||||
(
|
||||
[[0, 1, 2, 3], [5, 6]],
|
||||
4,
|
||||
[[0, 1, 2, 3, 5, 6]],
|
||||
[[4, 2]],
|
||||
),
|
||||
# case4: Split size is 1 (minimum granularity split)
|
||||
(
|
||||
[[0, 1], [2]],
|
||||
1,
|
||||
[[0, 2], [1]],
|
||||
[[1, 1], [1, 0]],
|
||||
),
|
||||
# case6: the batch contains an empty list
|
||||
(
|
||||
[[], [0, 1], [2]],
|
||||
1,
|
||||
[[0, 2], [1]],
|
||||
[[0, 1, 1], [0, 1, 0]],
|
||||
),
|
||||
# case: empty input
|
||||
(
|
||||
[],
|
||||
2,
|
||||
[],
|
||||
[],
|
||||
),
|
||||
])
|
||||
def test_split_multi_batch_kv_idx(
|
||||
kv_nomask_idx_multi_batch,
|
||||
split_size,
|
||||
expected_merged_idx,
|
||||
expected_merged_len,
|
||||
):
|
||||
# Mock input data
|
||||
model_runner = MagicMock(spec=NPUModelRunner)
|
||||
|
||||
# Call the method under test
|
||||
result = NPUModelRunner._split_multi_batch_kv_idx(
|
||||
self=model_runner,
|
||||
kv_nomask_idx_multi_batch=kv_nomask_idx_multi_batch,
|
||||
split_size=split_size)
|
||||
|
||||
merged_split_kv_idx_3d, merged_split_kv_len_2d = result
|
||||
|
||||
# Assert the method call
|
||||
assert len(merged_split_kv_idx_3d) == len(expected_merged_idx)
|
||||
|
||||
for t, (actual_seg, expected_seg) in enumerate(
|
||||
zip(merged_split_kv_idx_3d, expected_merged_idx)):
|
||||
assert actual_seg == expected_seg
|
||||
|
||||
assert len(merged_split_kv_len_2d) == len(expected_merged_len)
|
||||
|
||||
for t, (actual_len, expected_len) in enumerate(
|
||||
zip(merged_split_kv_len_2d, expected_merged_len)):
|
||||
assert actual_len == expected_len
|
||||
|
||||
Reference in New Issue
Block a user