Drop torchair (#4814)

aclgraph is stable and fast now. Let's drop torchair graph mode now. TODO: some logic to adapt torchair should be cleaned up as well. We'll do it in the following PR. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-12-10 09:20:40 +08:00
parent ba9cda9dfd
commit 835b4c8f1d
84 changed files with 77 additions and 16881 deletions
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -857,7 +857,6 @@ class AscendMLAImpl(MLAAttentionImpl):
        ascend_config = get_ascend_config()
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
        self.enable_prefetch = ascend_config.weight_prefetch_config.enabled
-        self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz

        vllm_config = get_current_vllm_config()
        self.ring_mla_mask_size = 512
@@ -1248,7 +1247,7 @@ class AscendMLAImpl(MLAAttentionImpl):
        # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
        kv_no_split = kv_no_split.view(
            B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
-        cache_mode = "PA_NZ" if self.enable_kv_nz else "PA"
+        cache_mode = "PA"
        k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache(
            kv_no_split,
            self.kv_a_layernorm.weight,
@@ -1276,7 +1275,7 @@ class AscendMLAImpl(MLAAttentionImpl):
        # npu_kv_rmsnorm_rope_cache needs [B, N, S, D]
        kv_no_split = kv_no_split.view(
            B, N, S, self.kv_lora_rank + self.qk_rope_head_dim)
-        cache_mode = "PA_NZ" if self.enable_kv_nz else "PA"
+        cache_mode = "PA"
        _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache(
            kv_no_split,
            self.kv_a_layernorm.weight,
@@ -1318,18 +1317,11 @@ class AscendMLAImpl(MLAAttentionImpl):
        # shape of knope/k_pe for npu graph mode should be:
        # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim]
        actual_seq_lengths = None
-        if self.enable_kv_nz:
-            k_nope = k_nope.view(-1, self.num_kv_heads,
-                                 self.kv_lora_rank // 16, block_size, 16)
-            k_pe = k_pe.view(-1, self.num_kv_heads,
-                             self.qk_rope_head_dim // 16, block_size, 16)
-            input_layout = "BSND"
-        else:
-            k_nope = k_nope.view(-1, self.num_kv_heads, block_size,
-                                 self.kv_lora_rank)
-            k_pe = k_pe.view(-1, self.num_kv_heads, block_size,
-                             self.qk_rope_head_dim)
-            input_layout = "BNSD"
+        k_nope = k_nope.view(-1, self.num_kv_heads, block_size,
+                             self.kv_lora_rank)
+        k_pe = k_pe.view(-1, self.num_kv_heads, block_size,
+                         self.qk_rope_head_dim)
+        input_layout = "BNSD"

        if attn_metadata.attn_state in [
                AscendAttentionState.SpecDecoding,
@@ -1346,14 +1338,9 @@ class AscendMLAImpl(MLAAttentionImpl):
            spec_attn_mask = attn_metadata.decode.attn_mask  # type:ignore
            actual_seq_lengths = decode_meta.actual_seq_lengths_q
        else:
-            if self.enable_kv_nz:
-                q_nope = q_nope.view(num_tokens, 1, self.num_heads,
-                                     -1).contiguous()
-                q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1)
-            else:
-                q_nope = q_nope.view(num_tokens, self.num_heads, 1,
-                                     -1).contiguous()
-                q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1)
+            q_nope = q_nope.view(num_tokens, self.num_heads, 1,
+                                 -1).contiguous()
+            q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1)
            sparse_mode = 0
            spec_attn_mask = None