[Refact] Refactor MLA/SFA weight prefetch to be consistent with MoE weight prefetch (#6629)

### What this PR does / why we need it? 1. [Refact] Refactor MLA/SFA weight prefetch to be consistent with MoE weight prefetch 2. Remove the duplicated o_proj weight prefetch in forward for MLA/SFA ### Does this PR introduce _any_ user-facing change? NA ### How was this patch tested? 1) Performance result: Perf test data: *) MLA: | | 1st test | 2nd test | Output Token Throughput (Avg) | Performance improvement percentage | | --- | --- | --- | --- | --- | | o_proj duplicate prefetch | 11.9669 token/s | 12.0287 token/s | 11.9978 | | | o_proj no duplicate prefetch | 12.5594 token/s | 12.6216 token/s | 12.5905 | 4.94% | Single-layer performance improvement: 5%~8% *) SFA: | | 1st test | 2nd test | Output Token Throughput (Avg) | Performance improvement percentage | | --- | --- | --- | --- | --- | | o_proj duplicate prefetch | 13.0523 token/s | 13.1084 token/s | 13.08035 | | | o_proj no duplicate prefetch | 13.9844 token/s | 14.1678 token/s | 14.0761 | 7.6% | - vLLM version: v0.15.0 - vLLM main: d7e17aaacd --------- Signed-off-by: leo-pony <nengjunma@outlook.com>
2026-02-10 14:14:37 +08:00
parent 2a826b5fad
commit 66b60c9440
15 changed files with 98 additions and 56 deletions
--- a/vllm_ascend/ops/activation.py
+++ b/vllm_ascend/ops/activation.py
@@ -34,9 +34,7 @@ class AscendSiluAndMul(SiluAndMul):
        import torch_npu

        weight_prefetch_method = get_weight_prefetch_method()
-        if weight_prefetch_method:
-            weight_prefetch_method.maybe_prefetch_mlp_weight_preprocess(weight_prefetch_method.MLP_DOWN, x)
+        weight_prefetch_method.maybe_prefetch_mlp_weight_preprocess(weight_prefetch_method.MLP_DOWN, x)
        out = torch_npu.npu_swiglu(x)
-        if weight_prefetch_method:
-            weight_prefetch_method.maybe_prefetch_mlp_weight_postprocess(out)
+        weight_prefetch_method.maybe_prefetch_mlp_weight_postprocess(out)
        return out
--- a/vllm_ascend/ops/fused_moe/experts_selector.py
+++ b/vllm_ascend/ops/fused_moe/experts_selector.py
@@ -59,8 +59,7 @@ def select_experts(
    """
    # prefetch w1_w3_proj.weight preprocess
    weight_prefetch_method = get_weight_prefetch_method()
-    if weight_prefetch_method:
-        weight_prefetch_method.maybe_prefetch_moe_weight_preprocess(hidden_states, "gate_up")
+    weight_prefetch_method.maybe_prefetch_moe_weight_preprocess(hidden_states, "gate_up")
    is_support_npu_moe_gating_top_k = check_npu_moe_gating_top_k(
        hidden_states=hidden_states,
        top_k=top_k,
--- a/vllm_ascend/ops/fused_moe/moe_mlp.py
+++ b/vllm_ascend/ops/fused_moe/moe_mlp.py
@@ -100,8 +100,7 @@ def quant_apply_mlp(
    _output_dtype = w2_scale[0].dtype

    weight_prefetch_method = get_weight_prefetch_method()
-    if weight_prefetch_method:
-        weight_prefetch_method.maybe_prefetch_moe_weight_postprocess(hidden_states)
+    weight_prefetch_method.maybe_prefetch_moe_weight_postprocess(hidden_states)
    is_mc2 = get_forward_context().moe_comm_type == MoECommType.MC2
    if w1_scale_bias is None and w1_offset is None and is_mc2:
        if _custom_gmm_swiglu_enabled(fusion, dynamic_eplb):
--- a/vllm_ascend/ops/layernorm.py
+++ b/vllm_ascend/ops/layernorm.py
@@ -66,8 +66,7 @@ class AscendRMSNorm(RMSNorm):
            x.add_(self.bias)

        weight_prefetch_method = get_weight_prefetch_method()
-        if weight_prefetch_method:
-            weight_prefetch_method.maybe_prefetch_mlp_weight_postprocess(x)
+        weight_prefetch_method.maybe_prefetch_mlp_weight_postprocess(x)
        return x


--- a/vllm_ascend/ops/linear_op.py
+++ b/vllm_ascend/ops/linear_op.py
@@ -149,10 +149,9 @@ class CustomRowParallelOp(CustomLinearOp):
    def apply(self, input_):
        output, output_bias = self.apply_impl(input_)
        weight_prefetch_method = get_weight_prefetch_method()
-        if weight_prefetch_method:
-            weight_prefetch_method.maybe_prefetch_mlp_weight_preprocess(
-                weight_prefetch_method.MLP_GATE_UP, output, self.prefix
-            )
+        weight_prefetch_method.maybe_prefetch_mlp_weight_preprocess(
+            weight_prefetch_method.MLP_GATE_UP, output, self.prefix
+        )

        if not self.return_bias:
            return output
--- a/vllm_ascend/ops/weight_prefetch.py
+++ b/vllm_ascend/ops/weight_prefetch.py
@@ -47,6 +47,7 @@ class WeightPrefetchMethod:

    def __init__(self, weight_prefetch_config: WeightPrefetchConfig) -> None:
        self.is_moe = is_moe_model(get_current_vllm_config())
+        self.mla_sfa_prefetch_enable = weight_prefetch_config.enabled

        self.attn = ModuleWeightPrefetchConfig(
            module_name="attn",
@@ -94,6 +95,9 @@ class WeightPrefetchMethod:
        if not self.moe.is_active_this_forward:
            return
        forward_context = get_forward_context()
+        if not forward_context or forward_context.model_instance is None:
+            return
+
        # layer_idx is subtracted by 1 because layer_idx was incremented by 1 at layernorm.
        weight = forward_context.model_instance.model.layers[forward_context.layer_idx - 1].mlp.experts.w13_weight
        weight_size = weight.data.element_size() * weight.data.numel() * self.moe.prefetch_ratio.get(prefix, 0)
@@ -184,6 +188,33 @@ class WeightPrefetchMethod:
            forward_context.prefetch_mlp_gate_up_proj = False
            forward_context.prefetch_mlp_down_proj = False

+    def maybe_prefetch_mla_or_sla_weight_in_current_stream(
+        self,
+        inputs: torch.Tensor,
+        dependency: torch.Tensor,
+        max_size: int = 0,
+        linear_layer: torch.nn.Module | None = None,
+    ) -> None:
+        if not self.mla_sfa_prefetch_enable:
+            return
+
+        # The prefetching of the weights of the o_proj matrix in the W8A8
+        # scene is already performed once in AscendW8A8LinearMethod, so it
+        # is not needed here.
+        if linear_layer is not None:
+            from vllm_ascend.quantization.methods import AscendW8A8LinearMethod
+
+            if isinstance(
+                getattr(linear_layer.quant_method, "quant_method", None),
+                AscendW8A8LinearMethod,
+            ):
+                return
+
+        input_size = inputs.element_size() * inputs.numel()
+        if max_size <= 0 or max_size > input_size:
+            max_size = input_size
+        torch.ops.vllm.prefetch_preprocess(weight=inputs, start_flag=dependency, max_weight_size=int(max_size))
+

 def maybe_npu_prefetch(
    inputs: torch.Tensor, dependency: torch.Tensor, max_size: int = 0, offset: int = 0, *, enabled: bool = True