Add Custom Kernels For LoRA Performance (#1884)
### What this PR does / why we need it?
Add two custom kernels (bgmv_shrink and bgmv_expand) to improve the
performance of LoRA
### Does this PR introduce _any_ user-facing change?
no user-facing change
### How was this patch tested?
we add unit test files to test the custom AscendC kernels. See
vllm-ascend/tests/e2e/singlecard/ops/test_bgmv_shrink.py and
vllm-ascend/tests/e2e/singlecard/ops/test_bgmv_expand.py
Based on the actual test of the QWen2.5 7B model using vllm-ascend
version v0.9.2.rc1, the TTFT, TPOT and throughput have increased by
about 70%.
- vLLM version: v0.9.2
- vLLM main:
40d86ee412
---------
Signed-off-by: taoxudonghaha <justsheldon@163.com>
This commit is contained in:
28
csrc/ops.h
28
csrc/ops.h
@@ -60,4 +60,32 @@ namespace vllm_ascend {
|
||||
auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options);
|
||||
return new_tensor;
|
||||
}
|
||||
|
||||
// Launcher for the custom AscendC BGMV "shrink" kernel (the LoRA
// down-projection). Defined in the kernel sources; declared extern here so
// the torch-binding code can launch it on an NPU stream.
//
// NOTE(review): the semantics below are inferred from parameter names and the
// PR description — presumably y = scale * (x @ A[indices]) per token; confirm
// against the kernel implementation.
//
// @param type                element dtype tag of the tensor buffers.
// @param stream              NPU stream the kernel is enqueued on.
// @param x                   input activations; assumed
//                            [batch_size, input_hidden_dim] — TODO confirm.
// @param weight              stacked LoRA-A weight buffer, selected per token
//                            via `indices`.
// @param indices             per-token LoRA adapter index buffer.
// @param y                   output buffer; assumed [batch_size, lora_rank]
//                            — TODO confirm.
// @param batch_size          number of tokens in the batch.
// @param num_tokens_per_core tokens handled by each NPU core (work split).
// @param input_hidden_dim    hidden size of the input activations.
// @param lora_rank           LoRA rank, i.e. width of the shrunken output.
// @param scale               scaling factor applied to the result.
extern void bgmv_shrink_impl(
    AscendType type,
    void *stream,
    void *x,
    void *weight,
    void *indices,
    void *y,
    uint32_t batch_size,
    uint32_t num_tokens_per_core,
    uint32_t input_hidden_dim,
    uint32_t lora_rank,
    float scale);
|
||||
|
||||
// Launcher for the custom AscendC BGMV "expand" kernel (the LoRA
// up-projection). Defined in the kernel sources; declared extern here so the
// torch-binding code can launch it on an NPU stream.
//
// NOTE(review): semantics inferred from parameter names — presumably
// y_out = y + (x @ B[indices]) written into a slice of the full output;
// confirm against the kernel implementation.
//
// @param type                element dtype tag of the tensor buffers.
// @param stream              NPU stream the kernel is enqueued on.
// @param x                   shrunken activations; assumed
//                            [batch_size, lora_rank] — TODO confirm.
// @param weight              stacked LoRA-B weight buffer, selected per token
//                            via `indices`.
// @param indices             per-token LoRA adapter index buffer.
// @param y                   base/accumulator input buffer.
// @param y_out               output buffer receiving the expanded result.
// @param batch_size          number of tokens in the batch.
// @param num_tokens_per_core tokens handled by each NPU core (work split).
// @param lora_rank           LoRA rank, i.e. width of the input to expand.
// @param output_hidden_dim   width of the slice produced by this call.
// @param slice_offset        column offset of that slice within the full
//                            output row (supports fused/merged projections
//                            written slice by slice — verify against caller).
// @param output_full_dim     full row width of the output tensor.
extern void bgmv_expand_impl(
    AscendType type,
    void *stream,
    void *x,
    void *weight,
    void *indices,
    void *y,
    void *y_out,
    uint32_t batch_size,
    uint32_t num_tokens_per_core,
    uint32_t lora_rank,
    uint32_t output_hidden_dim,
    uint32_t slice_offset,
    uint32_t output_full_dim);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user