[2/N][refactor] torchair deepseek mla backend refactor (#2459)

### What this PR does / why we need it? This PR moves the current unified MLA backend to the torchair folder and removes torchair-related code from attention/mla_v1.py (1.3k -> 0.9k). ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Running eager mode with the MLA backend, and torchair mode with the code before [2445](https://github.com/vllm-project/vllm-ascend/pull/2445) - vLLM version: v0.10.0 - vLLM main: f571ff8eb6 Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-08-21 14:02:30 +08:00
parent 67a222c383
commit 0ca3f48c90
7 changed files with 2192 additions and 747 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -235,12 +235,18 @@ class NPUPlatform(Platform):
            raise ValueError("vLLM Ascend does not support V0 engine.")

        use_torchair = get_ascend_config().torchair_graph_config.enabled
-        if use_mla:
-            return "vllm_ascend.attention.mla_v1.AscendMLABackend"
-        elif use_torchair:
-            return "vllm_ascend.torchair.torchair_attention.AscendAttentionTorchairBackend"
-        else:
-            return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
+        # choose attention backend based on use_mla and use_torchair
+        backend_map = {
+            (True, True):
+            "vllm_ascend.torchair.torchair_mla.AscendMLATorchairBackend",
+            (True, False):
+            "vllm_ascend.attention.mla_v1.AscendMLABackend",
+            (False, True):
+            "vllm_ascend.torchair.torchair_attention.AscendAttentionTorchairBackend",
+            (False, False):
+            "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
+        }
+        return backend_map[(use_mla, use_torchair)]

    @classmethod
    def get_punica_wrapper(cls) -> str: