Add Custom Kernels For LoRA Performance (#2325)
### What this PR does / why we need it?
Add two custom operators (sgmv_shrink and sgmv_expand) to address LoRA
performance issues, and enable LoRA operators to run under ACL graph
mode, so as to improve model inference performance.
### Does this PR introduce _any_ user-facing change?
no user-facing change
### How was this patch tested?
Based on actual testing of the Qwen2.5 7B model with vllm-ascend
version v0.9.2.rc1 in ACL graph mode, TTFT, TPOT and throughput
improved by about 100%.
Signed-off-by: liuchn <909698896@qq.com>
- vLLM version: v0.10.0
- vLLM main:
1f83e7d849
---------
Signed-off-by: liuchn <909698896@qq.com>
Co-authored-by: liuchn <909698896@qq.com>
This commit is contained in:
@@ -69,6 +69,18 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask_meta(
|
||||
return {masked_input, mask};
|
||||
}
|
||||
|
||||
// Meta (shape-inference) implementation of bgmv_expand.
// Registered for the meta dispatch key so torch.compile / fake-tensor
// tracing can infer output metadata without running the NPU kernel.
// The real kernel writes its result into a tensor shaped like `y`, so
// the meta output simply mirrors y's shape, dtype and layout; the
// remaining arguments only participate in the real computation.
at::Tensor bgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y,
                            int64_t slice_offset, int64_t slice_size) {
    // No data is produced here — metadata propagation only.
    return at::empty_like(y);
}
|
||||
|
||||
// Meta (shape-inference) implementation of sgmv_expand.
// Mirrors bgmv_expand_meta: under the meta dispatch key the operator
// only needs to report the output's metadata, which matches `y`
// (the tensor the real kernel accumulates into). `x`, `weight`,
// `lora_indices`, `seq_len` and the slice parameters are unused here.
at::Tensor sgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
                            at::Tensor &y, int64_t slice_offset, int64_t slice_size) {
    // No data is produced here — metadata propagation only.
    return at::empty_like(y);
}
|
||||
|
||||
|
||||
} // namespace meta
|
||||
} // namespace vllm_ascend
|
||||
@@ -81,6 +93,10 @@ namespace {
|
||||
ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta);
|
||||
// Masked input and mask meta implementation
|
||||
ops.impl("get_masked_input_and_mask", &vllm_ascend::meta::get_masked_input_and_mask_meta);
|
||||
// Bgmv expand
|
||||
ops.impl("bgmv_expand", &vllm_ascend::meta::bgmv_expand_meta);
|
||||
// Sgmv expand
|
||||
ops.impl("sgmv_expand", &vllm_ascend::meta::sgmv_expand_meta);
|
||||
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user