[bugfix] Fix the error when combining MTP with full decode on DSV3.1 C8. (#7571)
…DSV3.1 C8.
### What this PR does / why we need it?
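DeepSeek v3.1 C8 hung when MTP was combined with full graph mode; this pull request fixes that hang.
In the `sparse_mode = 3` decode branch, it reshapes `dequant_scale_q_nope` to `(num_tokens, self.num_heads)` when `self.fa_quant_layer` is set; it also wraps `dequant_scale_q_nope` and `self.fak_descale_float` in `weak_ref_tensors` before appending them to `attn_params`.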
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.18.0
- vLLM main: ed359c497a
---------
Signed-off-by: pichangping <1337510399@qq.com>
This commit is contained in:
@@ -1318,6 +1318,8 @@ class AscendMLAImpl(MLAAttentionImpl):
|
|||||||
sparse_mode = 3
|
sparse_mode = 3
|
||||||
attn_mask = attn_metadata.decode.attn_mask # type:ignore
|
attn_mask = attn_metadata.decode.attn_mask # type:ignore
|
||||||
actual_seq_lengths = decode_meta.actual_seq_lengths_q
|
actual_seq_lengths = decode_meta.actual_seq_lengths_q
|
||||||
|
if self.fa_quant_layer:
|
||||||
|
dequant_scale_q_nope = dequant_scale_q_nope.view(num_tokens, self.num_heads)
|
||||||
elif self.fa_quant_layer:
|
elif self.fa_quant_layer:
|
||||||
attn_mask = None
|
attn_mask = None
|
||||||
input_layout = "BSND_NBSD"
|
input_layout = "BSND_NBSD"
|
||||||
@@ -1403,7 +1405,10 @@ class AscendMLAImpl(MLAAttentionImpl):
|
|||||||
weak_ref_tensors(softmax_lse),
|
weak_ref_tensors(softmax_lse),
|
||||||
)
|
)
|
||||||
if self.fa_quant_layer:
|
if self.fa_quant_layer:
|
||||||
attn_params = attn_params + (dequant_scale_q_nope, self.fak_descale_float) # type: ignore
|
attn_params = attn_params + (
|
||||||
|
weak_ref_tensors(dequant_scale_q_nope),
|
||||||
|
weak_ref_tensors(self.fak_descale_float),
|
||||||
|
) # type: ignore
|
||||||
else:
|
else:
|
||||||
attn_params = attn_params + (None, None) # type: ignore
|
attn_params = attn_params + (None, None) # type: ignore
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user