[Refactor] move the metadata from attention_v1 to util(ready for extract common_cp) & realize Ascendmetadata inherit from the parent class. (#5203)
RFC: https://github.com/vllm-project/vllm-ascend/issues/4629
1. Remove the pcp-related code from attention_v1.
2. Establish the inheritance relationship of CommonAttentionMetadata.
TODO
1. extract common_cp
2. move cp metadata to common_cp.
3. remove commonAttentionMetadata for aclgraph.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
@@ -268,7 +268,7 @@ class MtpProposer(Proposer):
|
||||
spec_attn_mask=self.runner.spec_attn_mask,
|
||||
attn_state=self.runner.attn_state,
|
||||
decode_token_per_req=self.runner.decode_token_per_req,
|
||||
)
|
||||
max_seq_len=0)
|
||||
if self.pcp_size * self.dcp_size > 1:
|
||||
# update long_seq related params and flatten block_table
|
||||
common_attn_metadata.prefill_context_parallel_metadata = \
|
||||
@@ -599,7 +599,7 @@ class MtpProposer(Proposer):
|
||||
spec_attn_mask=self.runner.spec_attn_mask,
|
||||
attn_state=self.runner.attn_state,
|
||||
decode_token_per_req=self.runner.decode_token_per_req,
|
||||
)
|
||||
max_seq_len=0)
|
||||
return spec_common_attn_metadata, token_indices
|
||||
|
||||
def _propose(
|
||||
@@ -1221,7 +1221,8 @@ class MtpProposer(Proposer):
|
||||
decode_token_per_req=self.runner.decode_token_per_req,
|
||||
num_computed_tokens_cpu=common_attn_metadata.
|
||||
num_computed_tokens_cpu,
|
||||
seq_lens=common_attn_metadata.seq_lens)
|
||||
seq_lens=common_attn_metadata.seq_lens,
|
||||
max_seq_len=0)
|
||||
|
||||
query_start_loc = common_attn_metadata.query_start_loc[
|
||||
1:1 + num_rejected_tokens_gpu.shape[0]]
|
||||
|
||||
Reference in New Issue
Block a user