[Feat][SP] Support SP for VL MoE models (#7044)
### What this PR does / why we need it?
2nd PR for https://github.com/vllm-project/vllm-ascend/issues/5712,
extend SP to VL MoE models.
### Does this PR introduce _any_ user-facing change?
Removes `sp_threshold` from the additional config and reuses `sp_min_token_num`
from vLLM.
### How was this patch tested?
- Model: Qwen3-VL-30B-A3B,
- TP4 DP2
- 100 reqs
- max concurrency 1
| Seq length | Mean TTFT (ms) main | Mean TTFT (ms) this PR |
|------------|---------------------|------------------------|
| 4k | 429.40 | 323.3 |
| 16k | 1297.01 | 911.74 |
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -156,6 +156,16 @@ class NPUPlatform(Platform):
|
||||
def get_device_capability(cls, device_id: int = 0):
    """Return the device capability for *device_id*.

    Always returns None on this platform — no CUDA-style capability tuple
    is reported (presumably capability checks are not applicable to Ascend
    NPUs; confirm against callers of this Platform hook).
    """
    return None
|
||||
|
||||
@classmethod
def apply_config_platform_defaults(cls, vllm_config: VllmConfig) -> None:
    """Apply Ascend-specific configuration defaults.

    When sequence parallelism is enabled (``pass_config.enable_sp``) and the
    user has not set ``sp_min_token_num``, compute a platform default via
    ``get_sp_min_token_num(vllm_config)`` and store it on the pass config.
    (The previous docstring claimed the value is set to 1; the code actually
    derives it from the helper — documented accordingly.)

    Args:
        vllm_config: The full vLLM configuration; mutated in place.
    """
    pass_config = vllm_config.compilation_config.pass_config
    if pass_config.enable_sp and pass_config.sp_min_token_num is None:
        # Imported lazily to avoid a module-level dependency on the
        # compilation-pass package during platform registration.
        from vllm_ascend.compilation.passes.sequence_parallelism import get_sp_min_token_num

        pass_config.sp_min_token_num = get_sp_min_token_num(vllm_config)
        # Lazy %-style args so formatting is skipped when INFO is disabled.
        logger.info("set sp_min_token_num to %s", pass_config.sp_min_token_num)
|
||||
|
||||
@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
    """Return the name string of the NPU device at *device_id*.

    Thin delegation to ``torch.npu.get_device_name``.
    """
    device_name = torch.npu.get_device_name(device_id)
    return device_name
|
||||
@@ -198,6 +208,7 @@ class NPUPlatform(Platform):
|
||||
|
||||
# initialize ascend config from vllm additional_config
|
||||
cls._fix_incompatible_config(vllm_config)
|
||||
|
||||
ascend_config = init_ascend_config(vllm_config)
|
||||
|
||||
if vllm_config.kv_transfer_config is not None:
|
||||
@@ -218,6 +229,7 @@ class NPUPlatform(Platform):
|
||||
if not isinstance(ascend_compilation_config, dict)
|
||||
else ascend_compilation_config
|
||||
)
|
||||
|
||||
ascend_config.update_compile_ranges_split_points()
|
||||
|
||||
if model_config and hasattr(model_config.hf_text_config, "index_topk"):
|
||||
@@ -363,7 +375,8 @@ class NPUPlatform(Platform):
|
||||
|
||||
if parallel_config and parallel_config.worker_cls == "auto":
|
||||
# TODO: this is a tricky way to disable `use_sequence_parallel_moe` in vllm.
|
||||
parallel_config.all2all_backend = "flashinfer_all2allv"
|
||||
if not vllm_config.compilation_config.pass_config.enable_sp:
|
||||
parallel_config.all2all_backend = "flashinfer_all2allv"
|
||||
if is_310p():
|
||||
parallel_config.worker_cls = "vllm_ascend._310p.worker_310p.NPUWorker310"
|
||||
elif ascend_config.xlite_graph_config.enabled:
|
||||
@@ -805,3 +818,7 @@ class NPUPlatform(Platform):
|
||||
"ignored on Ascend. Resetting to default (32)."
|
||||
)
|
||||
att_config.flash_attn_max_num_splits_for_cuda_graph = 32
|
||||
|
||||
@classmethod
def use_custom_op_collectives(cls) -> bool:
    """Whether this platform uses custom-op collectives.

    Always returns True for this platform.
    """
    return True
|
||||
|
||||
Reference in New Issue
Block a user