[Refactor] Refactor MLA/SFA weight prefetch to be consistent with MoE weight prefetch (#6629)

### What this PR does / why we need it?
1. [Refactor] Refactor MLA/SFA weight prefetch to be consistent with MoE
weight prefetch
2. Remove duplicated o_proj weight prefetch in forward for MLA/SFA

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?

1) Performance result:
Perf test data:
*) MLA:

| | 1st test | 2nd test | Output Token Throughput (Avg) | Performance improvement percentage |
| --- | --- | --- | --- | --- |
| o_proj duplicate prefetch | 11.9669 token/s | 12.0287 token/s | 11.9978 | |
| o_proj no duplicate prefetch | 12.5594 token/s | 12.6216 token/s | 12.5905 | 4.94% |

Single-layer performance improvement: 5%~8%

*) SFA:

| | 1st test | 2nd test | Output Token Throughput (Avg) | Performance improvement percentage |
| --- | --- | --- | --- | --- |
| o_proj duplicate prefetch | 13.0523 token/s | 13.1084 token/s | 13.08035 | |
| o_proj no duplicate prefetch | 13.9844 token/s | 14.1678 token/s | 14.0761 | 7.6% |

- vLLM version: v0.15.0
- vLLM main:
d7e17aaacd

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
This commit is contained in:
Nengjun Ma
2026-02-10 14:14:37 +08:00
committed by GitHub
parent 2a826b5fad
commit 66b60c9440
15 changed files with 98 additions and 56 deletions

View File

@@ -296,6 +296,8 @@ class TestCumsumGroupList(TestBase):
class TestUnifiedApplyMLP(TestBase):
@patch('vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method',
return_value=MagicMock())
@patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
@patch('vllm_ascend.utils.get_ascend_device_type',
return_value=AscendDeviceType.A3)
@@ -306,7 +308,8 @@ class TestUnifiedApplyMLP(TestBase):
mock_npu_dynamic_quant,
mock_npu_grouped_matmul,
mock_soc_version,
mock_get_forward_context):
mock_get_forward_context,
mock_get_weight_prefetch_method):
mock_forward_context = MagicMock()
mock_forward_context.moe_comm_type = MoECommType.MC2
@@ -402,13 +405,16 @@ class TestUnifiedApplyMLP(TestBase):
self.assertEqual(result.dtype, torch.float16)
@patch('vllm_ascend.ops.fused_moe.moe_mlp.HAS_TRITON', False)
@patch('vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method',
return_value=MagicMock())
@patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
@patch('torch_npu.npu_grouped_matmul')
@patch('torch_npu.npu_swiglu')
@patch('torch_npu.npu_dynamic_quant')
def test_unified_apply_mlp_with_quantization_and_dynamic_scale(
self, mock_npu_dynamic_quant, mock_npu_swiglu,
mock_npu_grouped_matmul, mock_get_forward_context):
mock_npu_grouped_matmul, mock_get_forward_context,
mock_get_weight_prefetch_method):
mock_forward_context = MagicMock()
mock_forward_context.with_quant = True
@@ -505,6 +511,8 @@ class TestUnifiedApplyMLP(TestBase):
self.assertEqual(result.shape, hidden_states.shape)
self.assertEqual(result.dtype, torch.float16)
@patch("vllm_ascend.ops.fused_moe.moe_mlp.get_weight_prefetch_method",
return_value=MagicMock())
@patch("vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context")
@patch("torch_npu.npu_grouped_matmul")
@patch("torch_npu.npu_swiglu")
@@ -513,7 +521,8 @@ class TestUnifiedApplyMLP(TestBase):
def test_unified_apply_mlp_with_quantization_and_fusion_mlp(
self, mock_npu_dynamic_quant, mock_npu_grouped_matmul_swiglu_quant,
mock_npu_swiglu, mock_npu_grouped_matmul,
mock_get_forward_context):
mock_get_forward_context,
mock_get_weight_prefetch_method):
mock_forward_context = MagicMock()
mock_forward_context.with_quant = True