From 6ddfc413124b3e83674b9822d2a863676c4d752f Mon Sep 17 00:00:00 2001
From: pichangping <1337510399@qq.com>
Date: Wed, 25 Mar 2026 14:36:26 +0800
Subject: [PATCH] [bugfix] Fix the hang when overlaying MTP and full decode on
 DSV3.1 C8. (#7571)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What this PR does / why we need it?

DeepSeek V3.1 C8 hung when MTP was overlaid with full decode graph mode;
this PR fixes that hang. Concretely, it reshapes `dequant_scale_q_nope`
to `(num_tokens, num_heads)` in the full-decode branch and wraps the FA
quantization descale tensors with `weak_ref_tensors` before appending
them to the captured `attn_params`, consistent with the other tensors
captured there.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.18.0
- vLLM main:
  https://github.com/vllm-project/vllm/commit/ed359c497a728f08b5b41456c07a688ccd510fbc

---------

Signed-off-by: pichangping <1337510399@qq.com>
---
 vllm_ascend/attention/mla_v1.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index ef4220a5..06108660 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1318,6 +1318,8 @@ class AscendMLAImpl(MLAAttentionImpl):
             sparse_mode = 3
             attn_mask = attn_metadata.decode.attn_mask  # type:ignore
             actual_seq_lengths = decode_meta.actual_seq_lengths_q
+            if self.fa_quant_layer:
+                dequant_scale_q_nope = dequant_scale_q_nope.view(num_tokens, self.num_heads)
         elif self.fa_quant_layer:
             attn_mask = None
             input_layout = "BSND_NBSD"
@@ -1403,7 +1405,10 @@ class AscendMLAImpl(MLAAttentionImpl):
                 weak_ref_tensors(softmax_lse),
             )
             if self.fa_quant_layer:
-                attn_params = attn_params + (dequant_scale_q_nope, self.fak_descale_float)  # type: ignore
+                attn_params = attn_params + (
+                    weak_ref_tensors(dequant_scale_q_nope),
+                    weak_ref_tensors(self.fak_descale_float),
+                )  # type: ignore
             else:
                 attn_params = attn_params + (None, None)  # type: ignore
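For readers unfamiliar with the pattern in the second hunk: `weak_ref_tensors` (the vLLM helper already applied to the neighbouring captured tensors such as `softmax_lse`) is intended to return tensors that reference the original storage without owning it, so placing them in the long-lived `attn_params` tuple does not pin the buffers for the lifetime of the captured graph. Below is a minimal, framework-free sketch of that lifetime difference using Python's standard `weakref` module; the `Buffer` class and the printed names are hypothetical stand-ins for illustration, not vLLM APIs:

```python
# Illustrative sketch only (not vLLM code): why holding strong references
# in a long-lived captured tuple pins buffers, while weak references don't.
import weakref


class Buffer:
    """Hypothetical stand-in for a large device tensor."""

    def __init__(self, name: str) -> None:
        self.name = name


# Strong capture: the tuple alone keeps the buffer alive after the
# caller drops its handle, so the memory can never be reclaimed.
a = Buffer("dequant_scale_q_nope")
strong_params = (a,)
del a
print(strong_params[0].name)  # dequant_scale_q_nope -- still pinned

# Weak capture: the tuple records the buffer without owning it, so
# dropping the caller's handle lets the memory be reclaimed.
b = Buffer("fak_descale_float")
weak_params = (weakref.ref(b),)
del b
print(weak_params[0]())  # None -- the buffer was released
```

The first hunk's `dequant_scale_q_nope.view(num_tokens, self.num_heads)` is the standard PyTorch no-copy reshape: it returns an alias over the same storage, presumably in the per-token, per-head layout the quantized full-decode path expects, so no new allocation is introduced inside the captured region.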