[Feat] Adapted mtp function to Qwen3-next (#3918)

### What this PR does / why we need it?
Adapts the MTP (multi-token prediction) function to Qwen3-next.
- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: drslark <slarksblood@qq.com>
2025-11-07 16:39:03 +08:00
parent 46ef280105
commit 23b785fdfb
10 changed files with 244 additions and 15 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -252,6 +252,17 @@ class AscendAttentionMetadataBuilder:
        self.dcp_rank = get_decode_context_model_parallel_rank(
        ) if self.dcp_size > 1 else 0

+        self.speculative_config = vllm_config.speculative_config
+        self.decode_threshold = 1
+        if self.speculative_config:
+            spec_token_num = self.speculative_config.num_speculative_tokens
+            self.decode_threshold += spec_token_num
+            assert self.decode_threshold <= 16, f"decode_threshold exceeded \
+                npu_fused_infer_attention_score TND layout's limit of 16, \
+                got {self.decode_threshold}"
+
+        AscendAttentionMetadataBuilder.reorder_batch_threshold = self.decode_threshold
+
    def reorder_batch(self, input_batch,
                      scheduler_output: "SchedulerOutput") -> bool:
        return False