[Refactor] cache cos/sin in mla & remove parameter model in builder. (#5277)

RFC: https://github.com/vllm-project/vllm-ascend/issues/4629 1. Cache cos/sin in mla 2. AttentionBuilder inherits from the original class of vllm. version: release/v0.13.0 - vLLM main: ad32e3e19c --------- Signed-off-by: weijinqian_v1 <weijinqian@huawei.com> Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-12-28 10:35:07 +08:00
parent 24328aaf00
commit dbe4c338f2
10 changed files with 167 additions and 224 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -20,7 +20,6 @@ from enum import Enum
 from typing import ClassVar, List, Optional, Tuple, Type

 import torch
-import torch.nn as nn
 import torch_npu
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionLayer, AttentionType)
@@ -29,7 +28,8 @@ from vllm.attention.backends.registry import (AttentionBackendEnum,
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils.math_utils import cdiv
-from vllm.v1.attention.backends.utils import AttentionCGSupport
+from vllm.v1.attention.backends.utils import (AttentionCGSupport,
+                                              AttentionMetadataBuilder)
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import AttentionSpec

@@ -170,7 +170,7 @@ class AscendMetadata:
    model_runner_type: str = ""


-class AscendAttentionMetadataBuilder:
+class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
    # Does this backend/builder support ACL Graphs for attention (default: no).
    aclgraph_support: ClassVar[AttentionCGSupport] = \
        AttentionCGSupport.ALWAYS
@@ -217,8 +217,8 @@ class AscendAttentionMetadataBuilder:
        self,
        common_prefix_len: int,
        common_attn_metadata: AscendCommonAttentionMetadata,
-        model: Optional[nn.Module] = None,
-    ):
+        fast_build: bool = False,
+    ) -> AscendMetadata:
        num_reqs = common_attn_metadata.num_reqs
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[:
@@ -261,7 +261,6 @@ class AscendAttentionMetadataBuilder:
        self,
        common_attn_metadata: AscendCommonAttentionMetadata,
        attn_state: AscendAttentionState = AscendAttentionState.DecodeOnly,
-        model: Optional[nn.Module] = None,
    ):
        if attn_state == AscendAttentionState.DecodeOnly:
            attn_metadata = self.build(