[Bugfix][LoRA][Operator] Fix LoRA custom operators accuracy issue (#2672)

### What this PR does / why we need it? Fix the LoRA accuracy issue that was introduced by the custom AscendC operators "bgmv_shrink, sgmv_shrink, bgmv_expand, sgmv_expand". The bug details are: - In the kernel function, if you want to call the GlobalTensor.GetSize method, you must first pass the second parameter, bufferSize, when calling GlobalTensor.SetGlobalBuffer. - Otherwise, the GlobalTensor.GetSize method will return a random value. - You can refer to [this doc](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/apiref/ascendcopapi/atlasascendc_api_07_00024.html). ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? pytest -sv tests/e2e/singlecard/test_ilama_lora.py pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - vLLM version: v0.10.1.1 - vLLM main: a344a5aa0a --------- Signed-off-by: paulyu12 <paulyu0307@gmail.com> Signed-off-by: paulyu12 <507435917@qq.com> Co-authored-by: paulyu12 <paulyu0307@gmail.com>
2025-09-02 11:46:59 +08:00
parent 214b32a346
commit 9f1e054fe3
9 changed files with 99 additions and 41 deletions
--- a/csrc/kernels/bgmv_shrink.cpp
+++ b/csrc/kernels/bgmv_shrink.cpp
@@ -29,7 +29,7 @@ public:

 public:
    __aicore__ inline BGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
-    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *indices, __gm__ void *y,
+    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *indices, uint32_t indicesSize, __gm__ void *y,
                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
                                uint32_t maxLoRARank, float scale)
    {
@@ -44,7 +44,7 @@ public:
        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
-        indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices);
+        indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);

        pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
@@ -214,13 +214,13 @@ private:

 #define BGMV_SHRINK_TYPE_DECLARE(TYPE)                                                                                 \
    extern "C" __global__ __aicore__ void bgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
-                                                             __gm__ void* y, uint32_t batchSize,                       \
+                                                             uint32_t indicesSize, __gm__ void* y, uint32_t batchSize, \
                                                             uint32_t numTokensPerCore, uint32_t inputHiddenDim,       \
                                                             uint32_t maxLoRARank, float scale)                        \
    {                                                                                                                  \
        AscendC::TPipe pipe;                                                                                           \
        BGMVShrink<TYPE> op(&pipe);                                                                                    \
-        op.Init(x, weight, indices, y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale);            \
+        op.Init(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale);  \
        op.Process();                                                                                                  \
    }

@@ -231,17 +231,17 @@ BGMV_SHRINK_TYPE_DECLARE(half)
 #endif

 namespace vllm_ascend {
-extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* indices,
+extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
                             void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
                             uint32_t maxLoRARank, float scale)
 {
    uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
    if (type == AscendType::FP16) {
-        bgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, indices, y, batchSize, numTokensPerCore, 
+        bgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, 
                                                        inputHiddenDim, maxLoRARank, scale);
    } else if (type == AscendType::BF16) {
        #if (__CCE_AICORE__ >= 220)
-            bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, y, batchSize, numTokensPerCore, 
+            bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, 
                                                                  inputHiddenDim, maxLoRARank, scale);
        #endif
    } else {