[Refactor] Move the metadata from attention_v1 to util (in preparation for extracting common_cp) and make Ascendmetadata inherit from the parent class. (#5203)
RFC: https://github.com/vllm-project/vllm-ascend/issues/4629
1. Remove the pcp-related code from attention_v1.
2. Establish the inheritance relationship of CommonAttentionMetadata.
TODO:
1. Extract common_cp.
2. Move the cp metadata to common_cp.
3. Remove CommonAttentionMetadata for aclgraph.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
@@ -1045,7 +1045,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
max_query_len=max_num_scheduled_tokens,
|
||||
decode_token_per_req=self.decode_token_per_req,
|
||||
prefill_context_parallel_metadata=long_seq_metadata,
|
||||
)
|
||||
max_seq_len=0)
|
||||
|
||||
if self.speculative_config and self.pcp_size * self.dcp_size > 1:
|
||||
# For pcp + spec decode, we flatten block_table
|
||||
@@ -1874,7 +1874,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
max_query_len=max_query_len,
|
||||
decode_token_per_req=self.decode_token_per_req,
|
||||
prefill_context_parallel_metadata=long_seq_metadata,
|
||||
)
|
||||
max_seq_len=0)
|
||||
if self.pcp_size * self.dcp_size > 1:
|
||||
common_attn_metadata.block_table_tensor = \
|
||||
block_table_tensor[:num_reqs * self.decode_threshold]
|
||||
|
||||
@@ -53,6 +53,7 @@ def build_attn_metadata(
|
||||
"""Build attention metadata for Ascend NPUs."""
|
||||
# TODO(Ronald1995): optimize AscendCommonAttentionMetadata.
|
||||
max_query_len = int(query_start_loc_cpu.max())
|
||||
max_seq_len = int(seq_lens_cpu.max())
|
||||
|
||||
attn_metadata: dict[str, Any] = {}
|
||||
kv_cache_groups = kv_cache_config.kv_cache_groups
|
||||
@@ -80,7 +81,7 @@ def build_attn_metadata(
|
||||
graph_pad_size=graph_pad_size,
|
||||
num_input_tokens=num_input_tokens,
|
||||
prefill_context_parallel_metadata=prefill_context_parallel_metadata,
|
||||
)
|
||||
max_seq_len=max_seq_len)
|
||||
|
||||
attn_metadata_builder = attn_metadata_builders[i]
|
||||
metadata = attn_metadata_builder.build(
|
||||
|
||||
Reference in New Issue
Block a user