[Refactor] Refactor MLA/SFA weight prefetch to be consistent with MoE weight prefetch (#6629)
### What this PR does / why we need it?
1. [Refactor] Refactor the MLA/SFA weight prefetch so that it follows the same dispatch path as the MoE weight prefetch.
2. Remove the duplicated o_proj weight prefetch in the MLA/SFA forward pass (see the sketch below).
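For context, a minimal sketch of the call pattern that point 2 converges on. Only `maybe_prefetch_mla_or_sla_weight_in_current_stream` and the W8A8 skip come from the diff below; the attention-forward scaffolding (`self.o_proj`, `self.weight_prefetch_method`, `hidden_states`) is illustrative:

```python
# Illustrative sketch only: after the refactor, the MLA/SFA forward routes its
# o_proj prefetch through WeightPrefetchMethod (mirroring the MoE dispatch)
# instead of also issuing its own inline prefetch, which duplicated the one
# already triggered for W8A8 layers.
def forward(self, hidden_states):
    # Single prefetch entry point; internally skipped for W8A8 o_proj, where
    # AscendW8A8LinearMethod has already prefetched the weight once.
    self.weight_prefetch_method.maybe_prefetch_mla_or_sla_weight_in_current_stream(
        inputs=self.o_proj.weight,
        dependency=hidden_states,
        linear_layer=self.o_proj,
    )
    output, _ = self.o_proj(hidden_states)
    return output
```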
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
1) Performance test results:

*) MLA:

| | 1st test | 2nd test | Output Token Throughput (avg) | Performance improvement |
| --- | --- | --- | --- | --- |
| o_proj duplicate prefetch | 11.9669 token/s | 12.0287 token/s | 11.9978 token/s | |
| o_proj no duplicate prefetch | 12.5594 token/s | 12.6216 token/s | 12.5905 token/s | 4.94% |

Single-layer performance improvement: 5%~8%.
*) SFA:

| | 1st test | 2nd test | Output Token Throughput (avg) | Performance improvement |
| --- | --- | --- | --- | --- |
| o_proj duplicate prefetch | 13.0523 token/s | 13.1084 token/s | 13.08035 token/s | |
| o_proj no duplicate prefetch | 13.9844 token/s | 14.1678 token/s | 14.0761 token/s | 7.6% |
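For reference, the improvement column is the relative gain in average throughput: (12.5905 - 11.9978) / 11.9978 ≈ 4.94% for MLA and (14.0761 - 13.08035) / 13.08035 ≈ 7.6% for SFA.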
- vLLM version: v0.15.0
- vLLM main:
d7e17aaacd
---------
Signed-off-by: leo-pony <nengjunma@outlook.com>
@@ -47,6 +47,7 @@ class WeightPrefetchMethod:

    def __init__(self, weight_prefetch_config: WeightPrefetchConfig) -> None:
        self.is_moe = is_moe_model(get_current_vllm_config())
        self.mla_sfa_prefetch_enable = weight_prefetch_config.enabled

        self.attn = ModuleWeightPrefetchConfig(
            module_name="attn",
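A minimal construction sketch for the two flags set in `__init__` above; it assumes `WeightPrefetchConfig` can be instantiated with just `enabled` (only the `.enabled` read is confirmed by the hunk):

```python
# Hypothetical sketch: what the two flags initialized above gate.
# Assumes WeightPrefetchConfig(enabled=...) is constructible this way.
config = WeightPrefetchConfig(enabled=True)
method = WeightPrefetchMethod(config)

# is_moe (via is_moe_model) selects the MoE prefetch path (w13_weight and
# prefetch_ratio below); mla_sfa_prefetch_enable gates the new MLA/SFA
# method added later in this diff.
assert method.mla_sfa_prefetch_enable
```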
@@ -94,6 +95,9 @@ class WeightPrefetchMethod:
        if not self.moe.is_active_this_forward:
            return
        forward_context = get_forward_context()
        if not forward_context or forward_context.model_instance is None:
            return

        # layer_idx is subtracted by 1 because layer_idx was incremented by 1 at layernorm.
        weight = forward_context.model_instance.model.layers[forward_context.layer_idx - 1].mlp.experts.w13_weight
        weight_size = weight.data.element_size() * weight.data.numel() * self.moe.prefetch_ratio.get(prefix, 0)
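As a standalone illustration of the size expression in this hunk, with a made-up shape and ratio (only the `element_size() * numel() * ratio` formula is from the diff):

```python
import torch

# Made-up expert-weight shape and prefetch ratio, purely to show the byte math.
w13_weight = torch.empty(8, 256, 1024, dtype=torch.bfloat16)
prefetch_ratio = 0.5

# Same expression as the hunk: bytes per element * element count * ratio.
weight_size = w13_weight.element_size() * w13_weight.numel() * prefetch_ratio
print(int(weight_size))  # 8 * 256 * 1024 elements * 2 bytes * 0.5 = 2097152
```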
@@ -184,6 +188,33 @@ class WeightPrefetchMethod:
        forward_context.prefetch_mlp_gate_up_proj = False
        forward_context.prefetch_mlp_down_proj = False

    def maybe_prefetch_mla_or_sla_weight_in_current_stream(
        self,
        inputs: torch.Tensor,
        dependency: torch.Tensor,
        max_size: int = 0,
        linear_layer: torch.nn.Module | None = None,
    ) -> None:
        if not self.mla_sfa_prefetch_enable:
            return

        # The prefetching of the weights of the o_proj matrix in the W8A8
        # scene is already performed once in AscendW8A8LinearMethod, so it
        # is not needed here.
        if linear_layer is not None:
            from vllm_ascend.quantization.methods import AscendW8A8LinearMethod

            if isinstance(
                getattr(linear_layer.quant_method, "quant_method", None),
                AscendW8A8LinearMethod,
            ):
                return

        input_size = inputs.element_size() * inputs.numel()
        if max_size <= 0 or max_size > input_size:
            max_size = input_size
        torch.ops.vllm.prefetch_preprocess(weight=inputs, start_flag=dependency, max_weight_size=int(max_size))


def maybe_npu_prefetch(
    inputs: torch.Tensor, dependency: torch.Tensor, max_size: int = 0, offset: int = 0, *, enabled: bool = True
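A hedged usage sketch of the new method; `weight_prefetch_method`, the tensors, and their shapes are stand-ins, and only the signature and the clamping behavior are taken from the diff above:

```python
import torch

# Stand-in tensors: at the real call sites, `inputs` is o_proj.weight and
# `dependency` is an activation the prefetch waits on (the start_flag).
o_proj_weight = torch.empty(4096, 4096, dtype=torch.bfloat16)
hidden_states = torch.empty(16, 4096, dtype=torch.bfloat16)

# max_size <= 0 (the default) is clamped to the full tensor size,
# inputs.element_size() * inputs.numel(), before the op is issued. Passing
# the o_proj module as linear_layer enables the W8A8 skip, since
# AscendW8A8LinearMethod already prefetches that weight once.
weight_prefetch_method.maybe_prefetch_mla_or_sla_weight_in_current_stream(
    inputs=o_proj_weight,
    dependency=hidden_states,
    max_size=0,          # clamped to 4096 * 4096 * 2 bytes = 33554432
    linear_layer=None,   # or pass the o_proj module to enable the W8A8 skip
)
```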