[1/N][Feat] Add weight prefetch feature for Attention layers (#3146)

### What this PR does / why we need it? - Refactor and integrate a unified `WeightPrefetchMethod` - Integrate `qkv_proj.weight` and `o_proj.weight` in quantized Attention modules - Prefetching these weights ahead of matmul-like operators improves performance by reducing L2 cache transfer latency ### Does this PR introduce _any_ user-facing change? Add a new config in `--additional-config` for configuration: ```json { "weight_prefetch_config": { "enabled": false, "prefetch_ratio": { "attn": { "qkv": 1.0, "o": 1.0 } } } } ``` This feature is enabled by default, and can be disabled through this configuration ### How was this patch tested? - vLLM version: v0.11.0 --------- Signed-off-by: yuzhup <15705211260@163.com> Signed-off-by: zhoux77899 <zhouxiang100@huawei.com> Co-authored-by: yuzhup <15705211260@163.com>
2025-10-09 20:38:39 +08:00
parent 23db56a340
commit ff37575936
13 changed files with 264 additions and 69 deletions
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -21,6 +21,7 @@ import torch
 import torch_npu
 from vllm.attention.backends.abstract import AttentionType
 from vllm.distributed.parallel_state import get_ep_group
+from vllm.forward_context import get_forward_context

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.moe.experts_selector import select_experts
@@ -97,11 +98,32 @@ class AscendW8A8LinearMethod:
        tp_rank: Optional[int] = 0,
    ) -> torch.Tensor:
        if x.dtype != torch.int8:
+            attn_weight_map = {
+                "AscendQKVParallelLinear": "qkv",
+                "AscendRowParallelLinear": "o",
+            }
+            layer_cls_name = layer.__class__.__name__
+            weight_prefetch_method = get_forward_context(
+            ).weight_prefetch_method
+            assert weight_prefetch_method is not None
+
+            # prefetch_qkvo_proj.weight preprocess
+            weight_prefetch_method.maybe_prefetch_attn_weight_preprocess(
+                prefix=attn_weight_map.get(layer_cls_name, ""),
+                weight=layer.weight,
+                start_flag=x,
+            )
+            # quant
            x = quant_per_tensor(
                x,
                layer.aclnn_input_scale_reciprocal,
                layer.aclnn_input_offset,
            )
+            # prefetch_qkvo_proj.weight postprocess
+            if layer_cls_name in attn_weight_map.keys():
+                weight_prefetch_method.maybe_prefetch_attn_weight_postprocess(
+                    x)
+
        quant_bias = layer.quant_bias if tp_rank == 0 else None
        if is_310p():
            # On 300I Duo platform, we need transpose again if