[1/N][Feat] Add weight prefetch feature for Attention layers (#3146)
### What this PR does / why we need it?
- Refacotr and integrate a unified `WeightPrefetchMethod`
- Integrate `qkv_proj.weight` and `o_proj.weight` in quantized Attention
modules
- Prefetching these weights ahead of matmul-like operators imporves
performance by reducing L2 cache transfer latency
### Does this PR introduce _any_ user-facing change?
Add a new config in `--additional-config` for configuration:
```json
{
"weight_prefetch_config": {
"enabled": false,
"prefetch_ratio": {
"attn": {
"qkv": 1.0,
"o": 1.0,
},
},
},
}
```
This feature is enabled by default, and can be disabled through this
configuration
### How was this patch tested?
- vLLM version: v0.11.0
---------
Signed-off-by: yuzhup <15705211260@163.com>
Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
Co-authored-by: yuzhup <15705211260@163.com>
This commit is contained in:
@@ -21,6 +21,7 @@ import torch
|
||||
import torch_npu
|
||||
from vllm.attention.backends.abstract import AttentionType
|
||||
from vllm.distributed.parallel_state import get_ep_group
|
||||
from vllm.forward_context import get_forward_context
|
||||
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.ops.moe.experts_selector import select_experts
|
||||
@@ -97,11 +98,32 @@ class AscendW8A8LinearMethod:
|
||||
tp_rank: Optional[int] = 0,
|
||||
) -> torch.Tensor:
|
||||
if x.dtype != torch.int8:
|
||||
attn_weight_map = {
|
||||
"AscendQKVParallelLinear": "qkv",
|
||||
"AscendRowParallelLinear": "o",
|
||||
}
|
||||
layer_cls_name = layer.__class__.__name__
|
||||
weight_prefetch_method = get_forward_context(
|
||||
).weight_prefetch_method
|
||||
assert weight_prefetch_method is not None
|
||||
|
||||
# prefetch_qkvo_proj.weight preprocess
|
||||
weight_prefetch_method.maybe_prefetch_attn_weight_preprocess(
|
||||
prefix=attn_weight_map.get(layer_cls_name, ""),
|
||||
weight=layer.weight,
|
||||
start_flag=x,
|
||||
)
|
||||
# quant
|
||||
x = quant_per_tensor(
|
||||
x,
|
||||
layer.aclnn_input_scale_reciprocal,
|
||||
layer.aclnn_input_offset,
|
||||
)
|
||||
# prefetch_qkvo_proj.weight postprocess
|
||||
if layer_cls_name in attn_weight_map.keys():
|
||||
weight_prefetch_method.maybe_prefetch_attn_weight_postprocess(
|
||||
x)
|
||||
|
||||
quant_bias = layer.quant_bias if tp_rank == 0 else None
|
||||
if is_310p():
|
||||
# On 300I Duo platform, we need transpose again if
|
||||
|
||||
Reference in New Issue
Block a user