[Refactor] Move the metadata from attention_v1 to util (in preparation for extracting common_cp) and make AscendMetadata inherit from the parent class. (#5203)
RFC: https://github.com/vllm-project/vllm-ascend/issues/4629
1. Remove the pcp-related code from attention_v1.
2. Establish the inheritance relationship with CommonAttentionMetadata.
TODO
1. Extract common_cp.
2. Move the cp metadata to common_cp.
3. Remove CommonAttentionMetadata for aclgraph.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
@@ -117,7 +117,8 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.ChunkedPrefill,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
seq_lens=None,
|
||||
max_seq_len=6)
|
||||
mock_model = MagicMock()
|
||||
|
||||
self.builder.build(1, common_attn_metadata, mock_model)
|
||||
|
||||
@@ -606,7 +606,8 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.PrefillNoCache,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
seq_lens=None,
|
||||
max_seq_len=6)
|
||||
|
||||
base_inputs = {
|
||||
"num_actual_tokens": 10,
|
||||
@@ -673,7 +674,8 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.ChunkedPrefill,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
seq_lens=None,
|
||||
max_seq_len=6)
|
||||
|
||||
base_inputs = {
|
||||
"num_actual_tokens": 15,
|
||||
@@ -729,7 +731,8 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.DecodeOnly,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
seq_lens=None,
|
||||
max_seq_len=6)
|
||||
|
||||
base_inputs = {
|
||||
"num_actual_tokens": 3,
|
||||
@@ -784,7 +787,8 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.DecodeOnly,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
seq_lens=None,
|
||||
max_seq_len=6)
|
||||
|
||||
base_inputs = {
|
||||
"num_actual_tokens": 3,
|
||||
@@ -839,7 +843,8 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.PrefillNoCache,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
seq_lens=None,
|
||||
max_seq_len=6)
|
||||
|
||||
builder = AscendMLAMetadataBuilder(kv_cache_spec=self.kv_cache_spec,
|
||||
layer_names=["layer_0", "layer_1"],
|
||||
|
||||
Reference in New Issue
Block a user