[Refact] Refactor MLA/SFA weight prefetch to be consistent with MoE weight prefetch (#6629)

### What this PR does / why we need it?

1. [Refact] Refactor MLA/SFA weight prefetch to be consistent with MoE weight prefetch.
2. Remove the duplicated o_proj weight prefetch in forward for MLA/SFA.

### Does this PR introduce _any_ user-facing change?

NA

### How was this patch tested?

1) Performance result (perf test data):

*) MLA:

| | 1st test | 2nd test | Output Token Throughput (Avg) | Performance improvement percentage |
| --- | --- | --- | --- | --- |
| o_proj duplicate prefetch | 11.9669 token/s | 12.0287 token/s | 11.9978 | |
| o_proj no duplicate prefetch | 12.5594 token/s | 12.6216 token/s | 12.5905 | 4.94% |

Single-layer performance improvement: 5%~8%

*) SFA:

| | 1st test | 2nd test | Output Token Throughput (Avg) | Performance improvement percentage |
| --- | --- | --- | --- | --- |
| o_proj duplicate prefetch | 13.0523 token/s | 13.1084 token/s | 13.08035 | |
| o_proj no duplicate prefetch | 13.9844 token/s | 14.1678 token/s | 14.0761 | 7.6% |

- vLLM version: v0.15.0
- vLLM main: d7e17aaacd

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
2026-02-10 14:14:37 +08:00
parent 2a826b5fad
commit 66b60c9440
15 changed files with 98 additions and 56 deletions
--- a/tests/ut/ops/test_activation.py
+++ b/tests/ut/ops/test_activation.py
@@ -53,9 +53,12 @@ def test_QuickGELU_forward(mock_gelu, dummy_tensor, default_vllm_config):


@pytest.mark.skipif(is_310p_hw(), reason="non_310P device unittest case.")
+@patch("vllm_ascend.ops.activation.get_weight_prefetch_method",
+       return_value=MagicMock())
@patch("torch_npu.npu_swiglu", side_effect=lambda x: x + 1)
 def test_SiluAndMul_forward(
    mock_swiglu,
+    mock_get_weight_prefetch_method,
    dummy_tensor,
    default_vllm_config,
 ):
--- a/tests/ut/ops/test_fused_moe.py
+++ b/tests/ut/ops/test_fused_moe.py
@@ -296,6 +296,8 @@ class TestCumsumGroupList(TestBase):

 class TestUnifiedApplyMLP(TestBase):

+    @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method',
+           return_value=MagicMock())
    @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
    @patch('vllm_ascend.utils.get_ascend_device_type',
           return_value=AscendDeviceType.A3)
@@ -306,7 +308,8 @@ class TestUnifiedApplyMLP(TestBase):
                                                     mock_npu_dynamic_quant,
                                                     mock_npu_grouped_matmul,
                                                     mock_soc_version,
-                                                     mock_get_forward_context):
+                                                     mock_get_forward_context,
+                                                     mock_get_weight_prefetch_method):

        mock_forward_context = MagicMock()
        mock_forward_context.moe_comm_type = MoECommType.MC2
@@ -402,13 +405,16 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertEqual(result.dtype, torch.float16)

    @patch('vllm_ascend.ops.fused_moe.moe_mlp.HAS_TRITON', False)
+    @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method',
+           return_value=MagicMock())
    @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
    def test_unified_apply_mlp_with_quantization_and_dynamic_scale(
            self, mock_npu_dynamic_quant, mock_npu_swiglu,
-            mock_npu_grouped_matmul, mock_get_forward_context):
+            mock_npu_grouped_matmul, mock_get_forward_context,
+            mock_get_weight_prefetch_method):

        mock_forward_context = MagicMock()
        mock_forward_context.with_quant = True
@@ -505,6 +511,8 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)

+    @patch("vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method",
+           return_value=MagicMock())
    @patch("vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context")
    @patch("torch_npu.npu_grouped_matmul")
    @patch("torch_npu.npu_swiglu")
@@ -513,7 +521,8 @@ class TestUnifiedApplyMLP(TestBase):
    def test_unified_apply_mlp_with_quantization_and_fusion_mlp(
            self, mock_npu_dynamic_quant, mock_npu_grouped_matmul_swiglu_quant,
            mock_npu_swiglu, mock_npu_grouped_matmul,
-            mock_get_forward_context):
+            mock_get_forward_context,
+            mock_get_weight_prefetch_method):

        mock_forward_context = MagicMock()
        mock_forward_context.with_quant = True
--- a/tests/ut/ops/test_linear.py
+++ b/tests/ut/ops/test_linear.py
@@ -83,7 +83,9 @@ class TestAscendUnquantizedLinearMethod(TestBase):

 class TestAscendRowParallelLinear(BaseLinearTest):

-    def test_mlp_optimize(self):
+    @patch("vllm_ascend.ops.linear_op.get_weight_prefetch_method",
+           return_value=MagicMock())
+    def test_mlp_optimize(self, mock_get_weight_prefetch_method):

        ascend_config._ASCEND_CONFIG = MagicMock()
        ascend_config._ASCEND_CONFIG.recompute_scheduler_enable = False
@@ -100,7 +102,9 @@ class TestAscendRowParallelLinear(BaseLinearTest):
        input_tensor = torch.randn(16, 8)
        linear(input_tensor)

-    def test_oproj_tp(self):
+    @patch("vllm_ascend.ops.linear_op.get_weight_prefetch_method",
+           return_value=MagicMock())
+    def test_oproj_tp(self, mock_get_weight_prefetch_method):

        config._current_vllm_config = MagicMock()