From 4270682383b4f4876296565bbb1227c1111b1d5e Mon Sep 17 00:00:00 2001 From: ttanzhiqiang <38750855+ttanzhiqiang@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:57:02 +0800 Subject: [PATCH] Waiting for BMM NZ support (Improve TPOT by 2ms) (#1131) ### What this PR does / why we need it? W_UV/W_UK_T cannot be converted to NZ format, because at this position they are fused into TransposeBatchMatMul, which does not support NZ; otherwise the weights are converted back to ND on every run. ### Does this PR introduce _any_ user-facing change? Using #1098 as the baseline, p90 TPOT improves from 90.79ms to 88.58ms, a TPOT improvement of about 2ms. ### How was this patch tested? Tested using #1101. --------- Signed-off-by: ttanzhiqiang <389825161@qq.com> --- vllm_ascend/attention/mla_v1.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 43cb71c..f741508 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -648,8 +648,10 @@ class AscendMLAImpl(MLAAttentionImpl): self.W_UV = W_UV.transpose(0, 1).contiguous() # Convert from (L, N, P) to (N, P, L) self.W_UK_T = W_UK.permute(1, 2, 0).contiguous() - self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) - self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) + + # Waiting for BMM NZ support + # self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) + # self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) def _compute_prefill_context( self,