[Feat] Flash comm allgher ep (#3334)

Support flash comm v1(Sequence Parallelism) for Allgather EP. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: realliujiaxu <realliujiaxu@163.com> Co-authored-by: zhaozx-cn <zhaozx2116@163.com>
2025-10-15 19:36:32 +08:00
parent 8abe517870
commit f69a83b7ba
15 changed files with 283 additions and 78 deletions
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -500,9 +500,12 @@ class TestAscendMLAImpl(TestBase):
        mock_up_proj.assert_called_once()
        mock_npu_fused_infer_attention_score.assert_called_once()

+    @patch("torch.ops.vllm.maybe_all_gather_and_maybe_unpad")
    @patch("vllm_ascend.attention.mla_v1.maybe_npu_prefetch")
-    def test_mla_preprocess(self, magic_npu_fetch):
+    def test_mla_preprocess(self, magic_npu_fetch,
+                            mock_maybe_all_gather_and_maybe_unpad):
        magic_npu_fetch.return_value = MagicMock()
+        mock_maybe_all_gather_and_maybe_unpad.side_effect = lambda x, label: x
        batch_size = 4
        seq_len = 8
        hidden_size = 1024