From dc960e798e05038afc6a5e9bbba7e1ba1c20bf45 Mon Sep 17 00:00:00 2001
From: whx <56632993+whx-sjtu@users.noreply.github.com>
Date: Thu, 30 Oct 2025 00:34:55 +0800
Subject: [PATCH] [BugFix] Fix mlapo accuracy problem related to weight
 processing. (#3850)

This PR fixes an mlapo accuracy problem related to weight processing.
It also adds back the mlapo-related e2e test with a quantized DeepSeek
model.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac

Signed-off-by: whx-sjtu <2952154980@qq.com>
---
 vllm_ascend/attention/mla_v1.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index d57e7316..c5c29cbc 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -826,9 +826,9 @@ class AscendMLAImpl(MLAAttentionImpl):
             ..., self.q_lora_rank:].contiguous()
         q_a_proj_wt = self.fused_qkv_a_proj.weight.data[
             ..., :self.q_lora_rank].contiguous()
-        kv_a_proj_wt = kv_a_proj_wt.contiguous()
+        kv_a_proj_wt = kv_a_proj_wt.t().contiguous()
         kv_a_proj_wt = trans_rope_weight(kv_a_proj_wt, self.qk_rope_head_dim)
-        kv_a_proj_wt = kv_a_proj_wt.contiguous()
+        kv_a_proj_wt = kv_a_proj_wt.t().contiguous()
         wd_qkv = torch.cat((kv_a_proj_wt, q_a_proj_wt), dim=-1)
         wd_qkv = wd_qkv.t().contiguous()
         wd_qkv = transdata(wd_qkv,
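
Why the two added transposes fix the accuracy problem, as a minimal
standalone sketch: after slicing the fused weight with
`[..., self.q_lora_rank:]`, the kv projection channels sit on the last
axis of `kv_a_proj_wt`, while `trans_rope_weight` reorders the rope
channels along the first axis of its input, so the weight has to be
transposed before the call and transposed back afterwards. Everything
below is illustrative: `trans_rope_weight_sketch`, its
half-split-to-interleaved reordering, and all shapes are assumptions,
not the real vllm_ascend helper.

    import torch

    torch.manual_seed(0)

    # Illustrative shapes only: hidden size 16, kv_lora_rank 6, rope dim 4.
    hidden_size, kv_lora_rank, rope_dim = 16, 6, 4
    kv_a_proj_wt = torch.randn(hidden_size, kv_lora_rank + rope_dim)

    def trans_rope_weight_sketch(w, rope_dim):
        # Hypothetical stand-in for trans_rope_weight: reorder the trailing
        # `rope_dim` rows (dim 0) from a half-split layout to an interleaved
        # one. The real helper may reorder differently; only the axis
        # convention (it permutes dim 0) matters for this illustration.
        nope, rope = w[:-rope_dim], w[-rope_dim:]
        half = rope_dim // 2
        rope = torch.stack((rope[:half], rope[half:]), dim=1).reshape(rope_dim, -1)
        return torch.cat((nope, rope), dim=0)

    # Buggy path (before this patch): the helper permutes rows, but after
    # the fused-weight slice the kv projection channels live on the columns,
    # so rows belonging to the hidden dimension get scrambled instead of
    # the rope channels.
    buggy = trans_rope_weight_sketch(kv_a_proj_wt.contiguous(), rope_dim)

    # Fixed path (what the patch does): transpose so the projection
    # channels are on dim 0, reorder, then transpose back for the later
    # torch.cat((kv_a_proj_wt, q_a_proj_wt), dim=-1).
    fixed = trans_rope_weight_sketch(kv_a_proj_wt.t().contiguous(), rope_dim)
    fixed = fixed.t().contiguous()

    assert fixed.shape == kv_a_proj_wt.shape
    assert not torch.equal(buggy, fixed)  # the two orderings really differ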