[Feature] Optimize SP & support SP for Qwen3-Next (#3225)

This PR accomplishes the following tasks:
**Optimize SP**
In the old implementation, the first layer performed an all_reduce, and the RMSNorm layer then split the result into chunks. We changed this to perform a reduce_scatter on the embedding side instead, replacing one all_reduce operation plus one chunk with a single reduce_scatter operation.
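
A minimal sketch of the communication change, using plain `torch.distributed` collectives (the function names and shapes are illustrative, not the actual vllm-ascend code):

```python
import torch
import torch.distributed as dist

def old_embedding_path(embed_out: torch.Tensor, tp_size: int) -> torch.Tensor:
    # Old: all_reduce the full-size embedding output, then each rank
    # slices out its own chunk for the sequence-parallel region.
    dist.all_reduce(embed_out)                    # full-size collective
    rank = dist.get_rank()
    return embed_out.chunk(tp_size, dim=0)[rank]  # separate chunk step

def new_embedding_path(embed_out: torch.Tensor, tp_size: int) -> torch.Tensor:
    # New: reduce_scatter sums across ranks and scatters the chunks in
    # one collective, so the separate chunk step disappears.
    out = torch.empty(embed_out.size(0) // tp_size, *embed_out.shape[1:],
                      dtype=embed_out.dtype, device=embed_out.device)
    dist.reduce_scatter_tensor(out, embed_out)
    return out
```

Besides removing the chunk step, a ring reduce_scatter moves roughly half the data of a ring all_reduce per rank, which is where the savings come from.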
**Support Qwen3-Next**
Qwen3-Next includes a linear attention module, so the prefix name of that module cannot take effect directly and needs dedicated handling; see the sketch below.
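
For illustration only, a hedged sketch of why a new module type can slip past prefix-based matching; the `linear_attn` prefix below is an assumption, not necessarily the real Qwen3-Next module path:

```python
# Hypothetical prefix-based dispatch: logic keyed on the standard
# attention prefix misses Qwen3-Next's linear attention blocks,
# whose module path uses a different name (assumed "linear_attn").
KNOWN_ATTN_PREFIXES = ("self_attn",)

def matches_attention(module_path: str) -> bool:
    return any(part in KNOWN_ATTN_PREFIXES for part in module_path.split("."))

assert matches_attention("model.layers.4.self_attn.qkv_proj")        # matched
assert not matches_attention("model.layers.3.linear_attn.qkv_proj")  # missed without special handling
```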


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>

@@ -9,12 +9,6 @@ from tests.ut.base import PytestBase
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
 
-def mock_maybe_chunk_residual(x, residual):
-    if x.size(0) != residual.size(0):
-        return residual[:4]
-    return residual
-
-
 def mock_rms_norm(x, weight, eps):
     return x + 1, None
@@ -36,8 +30,6 @@ class TestAscendRMSNorm(PytestBase):
     @pytest.fixture(autouse=True)
     def context(self, mocker: MockerFixture):
-        mocker.patch("torch.ops.vllm.maybe_chunk_residual",
-                     side_effect=mock_maybe_chunk_residual)
         mocker.patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
         mocker.patch("torch_npu.npu_add_rms_norm",
                      side_effect=mock_add_rms_norm)
@@ -66,21 +58,6 @@ class TestAscendRMSNorm(PytestBase):
         assert torch.allclose(x_out, x_out_expected)
 
-    # Test case for flashcomm_v1 scenario
-    def test_forward_oot_with_flashcomm_v1(self):
-        layer = RMSNorm(hidden_size=512, eps=1e-05)
-
-        x = torch.randn(4, 512, dtype=torch.bfloat16)
-        residual = torch.randn(16, 512, dtype=torch.bfloat16)
-
-        x_out, residual_out = layer.forward_oot(x, residual)
-        x_out_expected = 2 * x
-        residual_out_expected = 2 * residual[:4]
-
-        assert residual_out.size(0) == 4
-        assert torch.allclose(x_out, x_out_expected)
-        assert torch.allclose(residual_out, residual_out_expected)
-
     # Test case for addrmsnorm + w8a8 quant fusion
     def test_forward_oot_with_quant_fusion(self, mocker: MockerFixture):
         mock_is_310p = mocker.patch("vllm_ascend.utils.is_310p")