[1/N][Feat] Add weight prefetch feature for Attention layers (#3146)

### What this PR does / why we need it? - Refactor and integrate a unified `WeightPrefetchMethod` - Integrate `qkv_proj.weight` and `o_proj.weight` in quantized Attention modules - Prefetching these weights ahead of matmul-like operators improves performance by reducing L2 cache transfer latency ### Does this PR introduce _any_ user-facing change? Add a new config in `--additional-config` for configuration: ```json { "weight_prefetch_config": { "enabled": false, "prefetch_ratio": { "attn": { "qkv": 1.0, "o": 1.0 } } } } ``` This feature is enabled by default, and can be disabled through this configuration ### How was this patch tested? - vLLM version: v0.11.0 --------- Signed-off-by: yuzhup <15705211260@163.com> Signed-off-by: zhoux77899 <zhouxiang100@huawei.com> Co-authored-by: yuzhup <15705211260@163.com>
2025-10-09 20:38:39 +08:00
parent 23db56a340
commit ff37575936
13 changed files with 264 additions and 69 deletions
--- a/vllm_ascend/torchair/torchair_mla.py
+++ b/vllm_ascend/torchair/torchair_mla.py
@@ -23,9 +23,9 @@ from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
+from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
 from vllm_ascend.torchair.utils import (TorchairCommonAttentionMetadata,
                                        npu_stream_switch, npu_wait_tensor)
-from vllm_ascend.utils import npu_prefetch
 from vllm_ascend.worker.npu_input_batch import InputBatch

 if TYPE_CHECKING:
@@ -684,10 +684,10 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
        if hasattr(self, "running_in_graph") and not self.running_in_graph:
            return x
        MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024  # 16MB
-        npu_prefetch(self.o_proj.weight,
-                     x,
-                     max_size=MAX_O_PROJ_PREFETCH_SIZE,
-                     enabled=enable_multistream_mla)
+        maybe_npu_prefetch(self.o_proj.weight,
+                           x,
+                           max_size=MAX_O_PROJ_PREFETCH_SIZE,
+                           enabled=enable_multistream_mla)
        return self.o_proj(x, is_prefill=False)[0]

    # Return `ql_nope`, `q_pe`
@@ -1281,10 +1281,10 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
        current_ms_metadata = get_multistream_comm_context()
        MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024  # 16MB
        if current_ms_metadata is None:
-            npu_prefetch(self.o_proj.weight,
-                         o_proj_input,
-                         max_size=MAX_O_PROJ_PREFETCH_SIZE,
-                         enabled=enable_multistream_mla)
+            maybe_npu_prefetch(self.o_proj.weight,
+                               o_proj_input,
+                               max_size=MAX_O_PROJ_PREFETCH_SIZE,
+                               enabled=enable_multistream_mla)

            output[...] = self.o_proj(
                o_proj_input,
@@ -1292,10 +1292,10 @@ class AscendMLATorchairImpl(MLAAttentionImpl):
                is_force_scatter=self.enable_shared_expert_dp)[0]
        else:
            with torch.npu.stream(current_ms_metadata.comm_stream):
-                npu_prefetch(self.o_proj.weight,
-                             o_proj_input,
-                             max_size=MAX_O_PROJ_PREFETCH_SIZE,
-                             enabled=enable_multistream_mla)
+                maybe_npu_prefetch(self.o_proj.weight,
+                                   o_proj_input,
+                                   max_size=MAX_O_PROJ_PREFETCH_SIZE,
+                                   enabled=enable_multistream_mla)
                output[...] = self.o_proj(
                    o_proj_input,
                    is_prefill=True,