From 6ddfc413124b3e83674b9822d2a863676c4d752f Mon Sep 17 00:00:00 2001
From: pichangping <1337510399@qq.com>
Date: Wed, 25 Mar 2026 14:36:26 +0800
Subject: [PATCH] [bugfix] Fix the hang when overlaying MTP and full decode on
 DSV3.1 C8. (#7571)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?

DeepSeek V3.1 C8 hung when MTP was overlaid with full decode graph mode;
this PR fixes that hang. Concretely, it reshapes `dequant_scale_q_nope`
to `(num_tokens, num_heads)` in the full-decode branch and wraps the FA
quantization descale tensors with `weak_ref_tensors` before appending
them to the captured `attn_params`, consistent with the other tensors
captured there.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.18.0
- vLLM main:
  https://github.com/vllm-project/vllm/commit/ed359c497a728f08b5b41456c07a688ccd510fbc

---------

Signed-off-by: pichangping <1337510399@qq.com>
---
 vllm_ascend/attention/mla_v1.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index ef4220a5..06108660 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1318,6 +1318,8 @@ class AscendMLAImpl(MLAAttentionImpl):
             sparse_mode = 3
             attn_mask = attn_metadata.decode.attn_mask  # type:ignore
             actual_seq_lengths = decode_meta.actual_seq_lengths_q
+            if self.fa_quant_layer:
+                dequant_scale_q_nope = dequant_scale_q_nope.view(num_tokens, self.num_heads)
         elif self.fa_quant_layer:
             attn_mask = None
             input_layout = "BSND_NBSD"
@@ -1403,7 +1405,10 @@ class AscendMLAImpl(MLAAttentionImpl):
                 weak_ref_tensors(softmax_lse),
             )
             if self.fa_quant_layer:
-                attn_params = attn_params + (dequant_scale_q_nope, self.fak_descale_float)  # type: ignore
+                attn_params = attn_params + (
+                    weak_ref_tensors(dequant_scale_q_nope),
+                    weak_ref_tensors(self.fak_descale_float),
+                )  # type: ignore
             else:
                 attn_params = attn_params + (None, None)  # type: ignore
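For readers unfamiliar with the pattern in the second hunk: `weak_ref_tensors` (the vLLM helper already applied to the neighbouring captured tensors such as `softmax_lse`) is intended to return tensors that reference the original storage without owning it, so placing them in the long-lived `attn_params` tuple does not pin the buffers for the lifetime of the captured graph. Below is a minimal, framework-free sketch of that lifetime difference using Python's standard `weakref` module; the `Buffer` class and the printed names are hypothetical stand-ins for illustration, not vLLM APIs:

```python
# Illustrative sketch only (not vLLM code): why holding strong references
# in a long-lived captured tuple pins buffers, while weak references don't.
import weakref


class Buffer:
    """Hypothetical stand-in for a large device tensor."""

    def __init__(self, name: str) -> None:
        self.name = name


# Strong capture: the tuple alone keeps the buffer alive after the
# caller drops its handle, so the memory can never be reclaimed.
a = Buffer("dequant_scale_q_nope")
strong_params = (a,)
del a
print(strong_params[0].name)  # dequant_scale_q_nope -- still pinned

# Weak capture: the tuple records the buffer without owning it, so
# dropping the caller's handle lets the memory be reclaimed.
b = Buffer("fak_descale_float")
weak_params = (weakref.ref(b),)
del b
print(weak_params[0]())  # None -- the buffer was released
```

The first hunk's `dequant_scale_q_nope.view(num_tokens, self.num_heads)` is the standard PyTorch no-copy reshape: it returns an alias over the same storage, presumably in the per-token, per-head layout the quantized full-decode path expects, so no new allocation is introduced inside the captured region.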