[Feature] Optimize SP & support SP for Qwen3 Next. (#3225)

This PR accomplishes the following tasks: **Optimize SP** In the old implementation, the first layer used all_reduce and then relied on rms to split the result into chunks. We changed it to perform reduce_scatter on the embedding side, replacing one all_reduce operation and one chunk operation with a single reduce_scatter operation. **Support Qwen3 Next** Since Qwen3 Next includes a linear attention module, the prefix name of this module cannot take effect directly. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: weijinqian_v1 <weijinqian@huawei.com> Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-10-13 23:02:12 +08:00
parent 31682961af
commit 6972df5951
10 changed files with 140 additions and 193 deletions
--- a/vllm_ascend/ops/linear.py
+++ b/vllm_ascend/ops/linear.py
@@ -34,8 +34,7 @@ from vllm.model_executor.layers.quantization.base_config import \
    QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs

-from vllm_ascend.ops.linear_op import (get_column_parallel_op,
-                                       get_row_parallel_op)
+from vllm_ascend.ops.linear_op import get_parallel_op


 # TODO(realliujiaxu): Remove this class after linear of vllm supports custom comm group
@@ -100,8 +99,8 @@ class AscendQKVParallelLinear(QKVParallelLinear):
        return_bias: bool = True,
        disable_tp: bool = False,
    ):
-        self.custom_op, _, tp_size = get_column_parallel_op(
-            disable_tp, prefix, self)
+        self.custom_op, _, tp_size = get_parallel_op(disable_tp, prefix, self,
+                                                     "column")
        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
        self.hidden_size = hidden_size
        self.head_size = head_size
@@ -173,8 +172,8 @@ class AscendMergedColumnParallelLinear(MergedColumnParallelLinear):
        return_bias: bool = True,
        disable_tp: bool = False,
    ):
-        self.custom_op, self.tp_rank, self.tp_size = get_column_parallel_op(
-            disable_tp, prefix, self)
+        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(
+            disable_tp, prefix, self, "column")
        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
        self.output_sizes = output_sizes
        assert all(output_size % self.tp_size == 0
@@ -222,8 +221,8 @@ class AscendRowParallelLinear(RowParallelLinear):
        return_bias: bool = True,
        disable_tp: bool = False,
    ):
-        self.custom_op, self.tp_rank, self.tp_size = get_row_parallel_op(
-            disable_tp, prefix, self)
+        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(
+            disable_tp, prefix, self, "row")
        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
        # Divide the weight matrix along the first dimension.
        self.input_size_per_partition = divide(input_size, self.tp_size)
@@ -304,8 +303,8 @@ class AscendColumnParallelLinear(ColumnParallelLinear):
        return_bias: bool = True,
        disable_tp: bool = False,
    ):
-        self.custom_op, self.tp_rank, self.tp_size = get_column_parallel_op(
-            disable_tp, prefix, self)
+        self.custom_op, self.tp_rank, self.tp_size = get_parallel_op(
+            disable_tp, prefix, self, "column")
        # TODO(realliujiaxu): Replace the initialization code below with super().__init__ after linear of vllm supports custom comm group
        self.input_size_per_partition = input_size
        self.output_size_per_partition = divide(output_size, self.tp_size)