[FEATURE][MTP] Support MTP > 1 (#2708)

### What this PR does / why we need it?
[RFC:Support MTP > 1 for
DeepSeek](https://github.com/vllm-project/vllm-ascend/issues/2745)

- [x] dp1 tp16
- [x] dp4 tp4
- [x] dp2 tp8
- [x] torchair graph

- vLLM version: v0.10.1.1
- vLLM main:
c9f7081f9c

Signed-off-by: 1092626063 <1092626063@qq.com>
This commit is contained in:
1092626063
2025-09-05 09:11:22 +08:00
committed by GitHub
parent 83eb40a51c
commit 5b3646ab21
5 changed files with 206 additions and 88 deletions

View File

@@ -17,6 +17,7 @@
# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py
# isort: skip_file
import math
import types
from typing import Optional
@@ -427,6 +428,11 @@ class NPUTorchairModelRunner(NPUModelRunner):
for graph_batch_size in self.torchair_graph_batch_sizes:
cur_graph_batch_size = (graph_batch_size + tp_size -
1) // tp_size * tp_size
# MTP > 1: set cur_graph_batch_size to the LCM (Least Common Multiple) of
# graph_batch_size and tp_size, to accommodate both multi-DP and the FIA operator.
if self.speculative_config is not None and self.speculative_config.num_speculative_tokens > 1:
cur_graph_batch_size = (tp_size * graph_batch_size) \
// math.gcd(tp_size, graph_batch_size)
if cur_graph_batch_size not in new_graph_batch_sizes and \
cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens:
new_graph_batch_sizes.append(cur_graph_batch_size)