[FEAT] Refactor spec decode to support efficient padded speculation (#3528)

### What this PR does / why we need it? 1. Refactor `mtp_proposer.py`, splitting the torchair-related code into `mtp_torchair_proposer.py`. 2. Following https://github.com/vllm-project/vllm/pull/24539, implement padded speculative decoding as described in https://github.com/vllm-project/vllm/issues/21984. ### Does this PR introduce _any_ user-facing change? Users can set `disable_padded_drafter_batch` to disable/enable padded speculation; the default is `False`. Offline example: ``` speculative_config={"method": "deepseek_mtp", "num_speculative_tokens": 1, "disable_padded_drafter_batch": False} ``` ### How was this patch tested? - [x] eager with pad/unpad - [x] aclgraph with pad/unpad - [x] torchair with pad/unpad Performance test of deepseek-r1 with tp16, dp1: aclgraph with pad ITL: 168ms; aclgraph with unpad ITL: 169ms; original: 178ms - vLLM version: v0.11.0rc3 - vLLM main: 83f478bb19 --------- Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-10-30 16:53:05 +08:00
parent 10772d94e3
commit eff3e5fc6f
7 changed files with 1203 additions and 440 deletions
--- a/vllm_ascend/spec_decode/__init__.py
+++ b/vllm_ascend/spec_decode/__init__.py
@@ -19,14 +19,21 @@
 from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
 from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
 from vllm_ascend.spec_decode.ngram_proposer import NgramProposer
+from vllm_ascend.torchair.torchair_mtp_proposer import TorchairMtpProposer


-def get_spec_decode_method(method, vllm_config, device, runner):
+def get_spec_decode_method(method,
+                           vllm_config,
+                           device,
+                           runner,
+                           is_torchair_graph=False):
    if method == "ngram":
        return NgramProposer(vllm_config, device, runner)
    elif method in ["eagle", "eagle3"]:
        return EagleProposer(vllm_config, device, runner)
    elif method == 'deepseek_mtp':
+        if is_torchair_graph:
+            return TorchairMtpProposer(vllm_config, device, runner)
        return MtpProposer(vllm_config, device, runner)
    else:
        raise ValueError("Unknown speculative decoding method: "