Drop torchair (#4814)

aclgraph is now stable and fast, so let's drop torchair graph mode. TODO: some logic that was added to adapt to torchair should be cleaned up as well; we'll do that in a follow-up PR. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-12-10 09:20:40 +08:00
parent ba9cda9dfd
commit 835b4c8f1d
84 changed files with 77 additions and 16881 deletions
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -345,7 +345,6 @@ class AscendSFAImpl(MLAAttentionImpl):
        ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
        self.enable_prefetch = ascend_config.weight_prefetch_config.enabled
-        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
        self.enable_mlapo = envs.VLLM_ASCEND_ENABLE_MLAPO

        assert self.indexer is not None, "Indexer is required for DSA."
@@ -534,7 +533,7 @@ class AscendSFAImpl(MLAAttentionImpl):
        # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
        kv_no_split = kv_no_split.view(
            B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
-        cache_mode = "PA_NZ" if self.enable_kv_nz else "PA"
+        cache_mode = "PA"

        if self.enable_sfa_cp:
            assert slots_cp is not None