[BugFix] Fix precision issue for LoRA feature (#4141)
vLLM version: v0.11.0
vLLM main: vllm-project/vllm
### What this PR does / why we need it?
Fix the precision issue of the LoRA feature in vllm-ascend. The BGMV/SGMV kernels' `#if (__CCE_AICORE__ >= 220)` guards are relaxed to `#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)`, so the `bfloat16_t` kernel declarations and launches are no longer dropped when `__CCE_AICORE__` is undefined (an undefined identifier evaluates to 0 inside `#if`). In addition, the four LoRA kernel sources are added to the `VLLM_ASCEND_CUSTOM_OP_EXCLUDE` list in the CMake configuration, and `PunicaWrapperNPU` now upcasts the expand input `x` to `torch.float32` before calling the expand op.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
```bash
pytest tests/lora/test_llama_tp.py::test_llama_lora -s
```
![lora_test](https://github.com/user-attachments/assets/2a0b2325-5b05-4bbc-ac03-a7c9f0ad9d4c)
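
For context, here is a minimal offline-inference sketch that exercises the same LoRA path; the model name, adapter path, and prompt are placeholders and are not part of this PR or its tests:

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholder checkpoint and adapter; any model with a matching LoRA adapter
# goes through the bgmv/sgmv shrink and expand kernels touched by this fix.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

outputs = llm.generate(
    ["List all customers who placed an order in 2023."],
    SamplingParams(temperature=0.0, max_tokens=64),
    # LoRARequest(adapter_name, adapter_id, local_adapter_path)
    lora_request=LoRARequest("demo_adapter", 1, "/path/to/lora_adapter"),
)
print(outputs[0].outputs[0].text)
```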
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: hukongyi <hukongyi@cmbchina.com>
CMake configuration: the four LoRA kernel sources are added to the `VLLM_ASCEND_CUSTOM_OP_EXCLUDE` list.

```diff
@@ -62,6 +62,10 @@ set(VLLM_ASCEND_CUSTOM_OP
 )
 
 set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
+    ${KERNEL_FILES}/bgmv_expand.cpp
+    ${KERNEL_FILES}/bgmv_shrink.cpp
+    ${KERNEL_FILES}/sgmv_expand.cpp
+    ${KERNEL_FILES}/sgmv_shrink.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
 )
```
`bgmv_expand.cpp`: the `bfloat16_t` declaration and launch are no longer compiled out when `__CCE_AICORE__` is undefined.

```diff
@@ -342,7 +342,7 @@ private:
 
 // declare all dtype kernel
 BGMV_EXPAND_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
 BGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
 #endif
 
@@ -356,7 +356,7 @@ extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weigh
         bgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore,
                                                         maxLoRARank, outputHiddenDim, sliceOffset, outputFullDim);
     } else if (type == AscendType::BF16) {
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
         bgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
                                                               numTokensPerCore, maxLoRARank, outputHiddenDim,
                                                               sliceOffset, outputFullDim);
```
`bgmv_shrink.cpp`: same guard change.

```diff
@@ -226,7 +226,7 @@ private:
 
 // declare all dtype kernel
 BGMV_SHRINK_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
 BGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
 #endif
 
@@ -240,7 +240,7 @@ extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weigh
         bgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
                                                         inputHiddenDim, maxLoRARank, scale);
     } else if (type == AscendType::BF16) {
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
         bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
                                                               inputHiddenDim, maxLoRARank, scale);
 #endif
```
`sgmv_expand.cpp`: same guard change.

```diff
@@ -357,7 +357,7 @@ private:
 
 // declare all dtype kernel
 SGMV_EXPAND_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
 SGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
 #endif
 
@@ -375,7 +375,7 @@ extern void sgmv_expand_impl(AscendType type, void* stream, void* x, void* weigh
                                                           numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset,
                                                           outputFullDim);
     } else if (type == AscendType::BF16) {
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
         sgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
                                                               seqLen, seqLenSize, yIn, yOut, batchSize,
                                                               numTokensPerCore, maxLoRARank, outputHiddenDim,
```
`sgmv_shrink.cpp`: same guard change.

```diff
@@ -242,7 +242,7 @@ private:
 
 // declare all dtype kernel
 SGMV_SHRINK_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
 SGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
 #endif
 
@@ -260,7 +260,7 @@ extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weigh
                                                           numTokensPerCore, inputHiddenDim, maxLoRARank,
                                                           scale);
     } else if (type == AscendType::BF16) {
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
         sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
                                                               seqLen, seqLenSize,
                                                               y, batchSize,
```
`PunicaWrapperNPU`: the expand input is upcast to `float32` before the expand op.

```diff
@@ -255,6 +255,7 @@ class PunicaWrapperNPU(PunicaWrapperBase):
         # Embedding layer only need expand op
         expand_fun: Callable = (self._expand_prefill
                                 if self.is_prefill else self._expand_decode)
+        x = x.to(torch.float32)
         expand_fun(y, x, lora_b_stacked, add_inputs)
 
     def add_lora_linear(self,
```
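
As a rough illustration of why upcasting the input can matter (a standalone sketch with made-up sizes, not code from this PR or from the NPU kernels): accumulating a long dot product entirely in half precision drifts away from a `float32` accumulation.

```python
import torch

torch.manual_seed(0)

# Made-up size: a single dot product accumulated over 4096 terms,
# first entirely in float16, then with a float32 accumulator.
a = (torch.randn(4096) * 0.01).half()
b = torch.randn(4096).half()

acc_fp16 = (a * b).sum(dtype=torch.float16)  # half-precision accumulation
acc_fp32 = (a.float() * b.float()).sum()     # float32 reference

print(acc_fp16.item(), acc_fp32.item())
```

The two values typically differ in the trailing digits; keeping the accumulation in `float32` is the kind of drift the upcast is meant to avoid.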