【main】SP For Qwen3 MoE (#2209)
### What this PR does / why we need it?
Qwen3 MoE supports SP. In scenarios like AlltoAll, AlltoAllv, and MC2,
replacing AllReduce with Reduce-Scatter and AllGather achieves
computational benefits in norm operations while saving one AllGather
communication. This feature is enabled during the P-phase and delivers
notable gains in long-sequence scenarios (e.g., 16k–25k), with
performance improvements reaching 5%–10%.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
```
compilation_config={
"pass_config":{
"enable_sequence_parallelism": True
}
},
enable_expert_parallel=True,
```
- vLLM version: v0.10.0
- vLLM main:
9edd1db02b
---------
Signed-off-by: libaokui <libaokui@huawei.com>
Co-authored-by: libaokui <libaokui@huawei.com>
This commit is contained in:
@@ -234,3 +234,27 @@ def test_models_distributed_DeepSeek_W4A8DYNAMIC():
|
||||
},
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(prompts, max_tokens)
|
||||
|
||||
|
||||
def test_sp_for_qwen3_moe() -> None:
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
sampling_params = SamplingParams(max_tokens=5,
|
||||
temperature=0.0,
|
||||
top_k=50,
|
||||
top_p=0.9)
|
||||
|
||||
with VllmRunner(
|
||||
snapshot_download("Qwen/Qwen3-30B-A3B"),
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend="mp",
|
||||
compilation_config={
|
||||
"pass_config": {
|
||||
"enable_sequence_parallelism": True
|
||||
}
|
||||
},
|
||||
enable_expert_parallel=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
Reference in New Issue
Block a user