[BugFix]Fix precision issue for LoRA feature (#4141)

vLLM version: v0.11.0 vLLM main: vllm-project/vllm ### What this PR does / why we need it? Fix the precision issue of the LoRA feature in vllm-ascend. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? ```bash pytest tests/lora/test_llama_tp.py::test_llama_lora -s ``` <img width="1319" height="879" alt="lora_test" src="https://github.com/user-attachments/assets/2a0b2325-5b05-4bbc-ac03-a7c9f0ad9d4c" /> - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: hukongyi <hukongyi@cmbchina.com>
2025-12-19 14:22:06 +08:00
parent f952de93df
commit ea8f544ce7
6 changed files with 17 additions and 12 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,6 +62,10 @@ set(VLLM_ASCEND_CUSTOM_OP
 )

 set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
+    ${KERNEL_FILES}/bgmv_expand.cpp
+    ${KERNEL_FILES}/bgmv_shrink.cpp
+    ${KERNEL_FILES}/sgmv_expand.cpp
+    ${KERNEL_FILES}/sgmv_shrink.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
 )

--- a/csrc/kernels/bgmv_expand.cpp
+++ b/csrc/kernels/bgmv_expand.cpp
@@ -342,7 +342,7 @@ private:

 // declare all dtype kernel
 BGMV_EXPAND_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
    BGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
 #endif

@@ -356,8 +356,8 @@ extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weigh
        bgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore,
                                                        maxLoRARank, outputHiddenDim, sliceOffset, outputFullDim);
    } else if (type == AscendType::BF16) {
-        #if (__CCE_AICORE__ >= 220)
-            bgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
+        #if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
+        bgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
                                                                  numTokensPerCore, maxLoRARank, outputHiddenDim,
                                                                  sliceOffset, outputFullDim);
        #endif
--- a/csrc/kernels/bgmv_shrink.cpp
+++ b/csrc/kernels/bgmv_shrink.cpp
@@ -226,7 +226,7 @@ private:

 // declare all dtype kernel
 BGMV_SHRINK_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
    BGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
 #endif

@@ -240,8 +240,8 @@ extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weigh
        bgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, 
                                                        inputHiddenDim, maxLoRARank, scale);
    } else if (type == AscendType::BF16) {
-        #if (__CCE_AICORE__ >= 220)
-            bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, 
+        #if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
+        bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, 
                                                                  inputHiddenDim, maxLoRARank, scale);
        #endif
    } else {
--- a/csrc/kernels/sgmv_expand.cpp
+++ b/csrc/kernels/sgmv_expand.cpp
@@ -357,7 +357,7 @@ private:

 // declare all dtype kernel
 SGMV_EXPAND_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
    SGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
 #endif

@@ -375,8 +375,8 @@ extern void sgmv_expand_impl(AscendType type, void* stream, void* x, void* weigh
                                                        numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset, 
                                                        outputFullDim);
    } else if (type == AscendType::BF16) {
-        #if (__CCE_AICORE__ >= 220)
-            sgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, 
+        #if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
+        sgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, 
                                                                  seqLen, seqLenSize, yIn, yOut, batchSize,
                                                                  numTokensPerCore, maxLoRARank, outputHiddenDim,
                                                                  sliceOffset, outputFullDim);
--- a/csrc/kernels/sgmv_shrink.cpp
+++ b/csrc/kernels/sgmv_shrink.cpp
@@ -242,7 +242,7 @@ private:

 // declare all dtype kernel
 SGMV_SHRINK_TYPE_DECLARE(half)
-#if (__CCE_AICORE__ >= 220)
+#if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
    SGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
 #endif

@@ -260,8 +260,8 @@ extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weigh
                                                        numTokensPerCore, inputHiddenDim, maxLoRARank,
                                                        scale);
    } else if (type == AscendType::BF16) {
-        #if (__CCE_AICORE__ >= 220)
-            sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, 
+        #if !defined(__CCE_AICORE__) || (__CCE_AICORE__ >= 220)
+        sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, 
                                                                  seqLen, seqLenSize, 
                                                                  y, batchSize,
                                                                  numTokensPerCore, inputHiddenDim, maxLoRARank,
--- a/vllm_ascend/lora/punica_npu.py
+++ b/vllm_ascend/lora/punica_npu.py
@@ -255,6 +255,7 @@ class PunicaWrapperNPU(PunicaWrapperBase):
        # Embedding layer only need expand op
        expand_fun: Callable = (self._expand_prefill
                                if self.is_prefill else self._expand_decode)
+        x = x.to(torch.float32)
        expand_fun(y, x, lora_b_stacked, add_inputs)

    def add_lora_linear(self,