[Feature] Optimize SP and support SP for Qwen3 Next. (#3225)

This PR accomplishes the following tasks: **Optimize SP** In the previous implementation, the first layer performed an all_reduce, and the RMS norm was then used to split the result into chunks. We changed it to perform a reduce_scatter on the embedding side, replacing one all_reduce operation and one chunk operation with a single reduce_scatter operation. **Support Qwen3 Next** Since Qwen3 Next includes a linear attention module, the prefix name of this module cannot take effect directly. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: weijinqian_v1 <weijinqian@huawei.com> Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-10-13 23:02:12 +08:00
parent 31682961af
commit 6972df5951
10 changed files with 140 additions and 193 deletions
--- a/vllm_ascend/torchair/utils.py
+++ b/vllm_ascend/torchair/utils.py
@@ -208,11 +208,15 @@ def torchair_ops_patch():
    from vllm_ascend.ops.layernorm import AscendRMSNorm
    from vllm_ascend.ops.rotary_embedding import (
        AscendDeepseekScalingRotaryEmbedding, AscendRotaryEmbedding)
+    from vllm_ascend.ops.vocab_parallel_embedding import \
+        AscendVocabParallelEmbedding
    from vllm_ascend.torchair.ops import (torchair_activation,
                                          torchair_layernorm)
    from vllm_ascend.torchair.ops.torchair_rotary_embedding import (
        deepseek_rope_init_func, native_rope_deepseek_forward,
        qwen_rope_init_func, rope_forward)
+    from vllm_ascend.torchair.ops.torchair_vocab_parallel_embedding import \
+        vocab_embedding_forward

    AscendRotaryEmbedding.__init__ = qwen_rope_init_func  # type: ignore[method-assign]
    AscendRotaryEmbedding.forward_oot = rope_forward  # type: ignore[method-assign]
@@ -222,3 +226,4 @@ def torchair_ops_patch():

    AscendRMSNorm.forward_oot = torchair_layernorm.torchair_rmsnorm_forward_oot  # type: ignore[method-assign]
    AscendSiluAndMul.forward_oot = torchair_activation.torchair_silu_and_mul_forward_oot  # type: ignore[method-assign]
+    AscendVocabParallelEmbedding.forward = vocab_embedding_forward  # type: ignore[method-assign]