Refactor AscendMultiHeadLatentAttention (#2826)

### What this PR does / why we need it?
Register AscendMultiHeadLatentAttention as a CustomOp, following the corresponding upstream vLLM changes.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with newly added and existing tests.


- vLLM version: main
- vLLM main: b23fb78623

---------

Signed-off-by: Icey <1790571317@qq.com>
Commit aa4d2a91ed (parent 168ad600b5) by Icey, committed via GitHub on 2025-09-10 11:26:11 +08:00.
4 changed files with 170 additions and 48 deletions.


@@ -529,6 +529,10 @@ def register_ascend_customop():
     from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
     CustomOp.register_oot(_decorated_op_cls=AscendFusedMoE, name="FusedMoE")
     from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
     CustomOp.register_oot(_decorated_op_cls=AscendMultiHeadLatentAttention,
                           name="MultiHeadLatentAttention")
     # NOTE: Keep this at last to ensure all custom actions are registered
     _ASCEND_CUSTOMOP_IS_REIGISTERED = True
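The mechanism behind `CustomOp.register_oot` can be sketched without vLLM installed. The stand-in `CustomOp` registry below is hypothetical (the real class lives in vLLM, and `AscendMultiHeadLatentAttention` here is an empty placeholder); it only illustrates the pattern this PR relies on: a platform plugin maps an op name such as `"MultiHeadLatentAttention"` to its own out-of-tree implementation class, which the framework later resolves by name instead of the in-tree default.

```python
# Minimal sketch of out-of-tree (OOT) CustomOp registration, assuming a
# simple name -> class registry. Not the real vLLM implementation.

class CustomOp:
    # Maps an op name to the out-of-tree class that overrides the default.
    op_registry_oot: dict[str, type] = {}

    @classmethod
    def register_oot(cls, _decorated_op_cls=None, name: str = ""):
        """Register an out-of-tree op class under `name`."""
        cls.op_registry_oot[name] = _decorated_op_cls
        return _decorated_op_cls

    @classmethod
    def resolve(cls, name: str):
        """Return the registered OOT class for `name`, if any."""
        return cls.op_registry_oot.get(name)


# Placeholder standing in for the real Ascend implementation.
class AscendMultiHeadLatentAttention:
    pass


# Same call shape as in the diff above.
CustomOp.register_oot(_decorated_op_cls=AscendMultiHeadLatentAttention,
                      name="MultiHeadLatentAttention")
```

After registration, any lookup of `"MultiHeadLatentAttention"` through the registry yields the Ascend class, which is why the real code performs the import and registration together inside `register_ascend_customop()`.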