[2/N][refactor] torchair deepseek mla backend refactor (#2459)

### What this PR does / why we need it?
This PR moves the current unified MLA backend to the torchair folder and removes
torchair-related code from attention/mla_v1.py (1.3k -> 0.9k).

 
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Tested by running eager mode with the MLA backend, and torchair mode with the code from before
[2445](https://github.com/vllm-project/vllm-ascend/pull/2445).


- vLLM version: v0.10.0
- vLLM main:
f571ff8eb6

Signed-off-by: linfeng-yuan <1102311262@qq.com>
This commit is contained in:
linfeng-yuan
2025-08-21 14:02:30 +08:00
committed by GitHub
parent 67a222c383
commit 0ca3f48c90
7 changed files with 2192 additions and 747 deletions

View File

@@ -235,12 +235,18 @@ class NPUPlatform(Platform):
raise ValueError("vLLM Ascend does not support V0 engine.")
use_torchair = get_ascend_config().torchair_graph_config.enabled
if use_mla:
return "vllm_ascend.attention.mla_v1.AscendMLABackend"
elif use_torchair:
return "vllm_ascend.torchair.torchair_attention.AscendAttentionTorchairBackend"
else:
return "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
# choose attention backend based on use_mla and use_torchair
backend_map = {
(True, True):
"vllm_ascend.torchair.torchair_mla.AscendMLATorchairBackend",
(True, False):
"vllm_ascend.attention.mla_v1.AscendMLABackend",
(False, True):
"vllm_ascend.torchair.torchair_attention.AscendAttentionTorchairBackend",
(False, False):
"vllm_ascend.attention.attention_v1.AscendAttentionBackend"
}
return backend_map[(use_mla, use_torchair)]
@classmethod
def get_punica_wrapper(cls) -> str: