[Refactor] Cache cos/sin in MLA & remove the `model` parameter from the attention builder. (#5277)
RFC: https://github.com/vllm-project/vllm-ascend/issues/4629
1. Cache cos/sin in MLA.
2. AttentionBuilder now inherits from the original vLLM class.
version: release/v0.13.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
@@ -1122,21 +1122,10 @@ class NPUModelRunner(GPUModelRunner):
|
||||
num_decode_draft_tokens_cpu=self.
|
||||
num_decode_draft_tokens.cpu[:num_reqs],
|
||||
)
|
||||
attn_metadata_i = builder.build(
|
||||
common_prefix_len=common_prefix_len,
|
||||
common_attn_metadata=common_attn_metadata,
|
||||
**extra_attn_metadata_args)
|
||||
elif self.model_config.runner_type == "pooling":
|
||||
attn_metadata_i = builder.build(
|
||||
common_prefix_len=common_prefix_len,
|
||||
common_attn_metadata=common_attn_metadata,
|
||||
**extra_attn_metadata_args)
|
||||
else:
|
||||
attn_metadata_i = builder.build(
|
||||
common_prefix_len=common_prefix_len,
|
||||
common_attn_metadata=common_attn_metadata,
|
||||
model=self.get_model(),
|
||||
**extra_attn_metadata_args)
|
||||
attn_metadata_i = builder.build(
|
||||
common_prefix_len=common_prefix_len,
|
||||
common_attn_metadata=common_attn_metadata,
|
||||
**extra_attn_metadata_args)
|
||||
|
||||
for layer_name in attn_group.layer_names:
|
||||
attn_metadata[layer_name] = attn_metadata_i
|
||||
@@ -1918,7 +1907,7 @@ class NPUModelRunner(GPUModelRunner):
|
||||
common_metadata)
|
||||
else:
|
||||
attn_metadata_full_attention = builder.build_for_graph_capture(
|
||||
common_attn_metadata, attn_state, self.get_model())
|
||||
common_attn_metadata, attn_state)
|
||||
for layer_name in kv_cache_group_spec.layer_names:
|
||||
if "linear_attn" in layer_name:
|
||||
attn_metadata[
|
||||
|
||||
Reference in New Issue
Block a user