[Feat][Graph] Support MTP for ACL Graph (#2932)

### What this PR does / why we need it?

This PR depends on #2707 being merged first, and adapts the aclgraph functionality to support MTP.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main: 2b85697031

---------

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-09-18 14:05:33 +08:00
parent cef43b524e
commit 6681dde902
7 changed files with 73 additions and 11 deletions
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -187,7 +187,14 @@ class AscendMLAMetadataBuilder:
                           self.block_size - 1) // self.block_size
        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled

+        self.speculative_config = vllm_config.speculative_config
        self.decode_threshold = 1
+        if self.speculative_config:
+            spec_token_num = self.speculative_config.num_speculative_tokens
+            self.decode_threshold += spec_token_num
+            assert self.decode_threshold <= 16, f"decode_threshold exceeded \
+                npu_fused_infer_attention_score TND layout's limit of 16, \
+                got {self.decode_threshold}"

        if self.chunked_prefill_enabled:
            self.chunked_prefill_workspace_size = min(
@@ -275,7 +282,6 @@ class AscendMLAMetadataBuilder:
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        query_start_loc = common_attn_metadata.query_start_loc
        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
-        # TODO(xyx): remove the if condition after mla supports torch mode speculative decoding
        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
            split_decodes_and_prefills(common_attn_metadata, decode_threshold=self.decode_threshold)
        assert num_decodes + num_prefills == num_reqs