[Feat] Flash comm allgher ep (#3334)

Support flash comm v1(Sequence Parallelism) for Allgather EP.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
Co-authored-by: zhaozx-cn <zhaozx2116@163.com>
This commit is contained in:
realliujiaxu
2025-10-15 19:36:32 +08:00
committed by GitHub
parent 8abe517870
commit f69a83b7ba
15 changed files with 283 additions and 78 deletions

View File

@@ -20,6 +20,7 @@
import copy
import gc
import itertools
import math
import re
import time
from collections import defaultdict
@@ -128,8 +129,8 @@ from vllm_ascend.spec_decode.interface import SpecDcodeType
from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
AscendSocVersion, ProfileExecuteDuration,
get_ascend_soc_version, is_310p, is_enable_nz,
lmhead_tp_enable)
enable_sp, get_ascend_soc_version, is_310p,
is_enable_nz, lmhead_tp_enable)
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
if TYPE_CHECKING:
@@ -1210,6 +1211,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Add padding to the batch size.
num_input_tokens = self.vllm_config.pad_for_cudagraph(
total_num_scheduled_tokens)
elif self.use_aclgraph and enable_sp(self.vllm_config):
# When using aclgraph, if total_num_scheduled_tokens exceeds the maximum graph size,
# the model will fall back to running its FX graph in eager mode.
# In this case, when sequence parallelism is enabled, we need to pad tokens to align
# with tp_size because pad_size cannot be captured by the FX graph
tp_size = self.vllm_config.parallel_config.tensor_parallel_size
num_input_tokens = math.ceil(
total_num_scheduled_tokens / tp_size) * tp_size
else:
# Eager mode.
num_input_tokens = total_num_scheduled_tokens
@@ -1850,7 +1859,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
raise ValueError(f"Unsupported soc_version: {soc_version}")
if moe_comm_type == MoECommType.ALLGATHER and with_prefill:
moe_comm_type = MoECommType.NAIVE_MULTICAST
if enable_sp():
moe_comm_type = MoECommType.ALLGATHER
else:
moe_comm_type = MoECommType.NAIVE_MULTICAST
# PanguProMoE only supports allgather
if model_type == "PanguProMoE":
@@ -2314,6 +2326,12 @@ class NPUModelRunner(LoRAModelRunnerMixin):
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
}
# In multi-DP scenarios, there may be situations where all DP groups are executing dummy runs.
# If sequence parallelism is enabled, it is essential to ensure that num_tokens is divisible by tp_size.
if self.use_aclgraph and enable_sp(self.vllm_config):
tp_size = self.vllm_config.parallel_config.tensor_parallel_size
num_tokens = math.ceil(num_tokens / tp_size) * tp_size
# Padding for DP
(num_tokens, num_tokens_across_dp, with_prefill,
_) = self._sync_metadata_across_dp(num_tokens, with_prefill, False)