[Bugfix][LoRA][Operator] Fix LoRA custom operators accuracy issue (#2672)

### What this PR does / why we need it?
Fix the LoRA accuracy issue introduced by the custom AscendC operators
`bgmv_shrink`, `sgmv_shrink`, `bgmv_expand`, and `sgmv_expand`.

The bug details are:
- In the kernel function, if you want to call the `GlobalTensor.GetSize`
method, you must pass the second parameter, `bufferSize`, when calling
`GlobalTensor.SetGlobalBuffer` first.
- Otherwise, `GlobalTensor.GetSize` will return a random (uninitialized) value.
- For details, refer to [this
doc](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/apiref/ascendcopapi/atlasascendc_api_07_00024.html);
a minimal sketch of the failure mode follows this list.
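The sketch below is a hypothetical kernel fragment (not code from this PR's diff) that illustrates the contract, assuming the `AscendC::GlobalTensor` API as documented in the link above; the `IllustrateGetSize` function and its arguments are made up for illustration:

```cpp
// Hypothetical AscendC fragment showing why GetSize() needs the two-argument
// SetGlobalBuffer overload; names and sizes are illustrative only.
#include "kernel_operator.h"

__aicore__ inline void IllustrateGetSize(__gm__ void *loraIndices, uint32_t loraIndicesSize)
{
    AscendC::GlobalTensor<int64_t> loraIndicesGm;

    // Buggy: the one-argument overload never records the buffer size,
    // so the GetSize() call below reads an uninitialized field.
    loraIndicesGm.SetGlobalBuffer((__gm__ int64_t *)loraIndices);
    uint64_t sizeBad = loraIndicesGm.GetSize();   // random value

    // Fixed (the pattern this PR applies): pass the element count as the
    // second argument so GetSize() returns the real buffer size.
    loraIndicesGm.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
    uint64_t sizeOk = loraIndicesGm.GetSize();    // == loraIndicesSize
}
```

This is why the diff below threads `loraIndicesSize` and `seqLenSize` from the host wrapper all the way down to the kernel's `SetGlobalBuffer` calls.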

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
```bash
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
```

- vLLM version: v0.10.1.1
- vLLM main: a344a5aa0a

---------

Signed-off-by: paulyu12 <paulyu0307@gmail.com>
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: paulyu12 <paulyu0307@gmail.com>
yupeng authored on 2025-09-02 11:46:59 +08:00, committed by GitHub
Parent: 214b32a346 · Commit: 9f1e054fe3
9 changed files with 99 additions and 41 deletions


```diff
@@ -29,7 +29,8 @@ public:
 public:
     __aicore__ inline SGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
-    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, __gm__ void *seqLen,
+    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, uint32_t loraIndicesSize,
+                                __gm__ void *seqLen, uint32_t seqLenSize,
                                 __gm__ void *y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
                                 uint32_t maxLoRARank, float scale)
     {
@@ -44,8 +45,8 @@ public:
         xGm_.SetGlobalBuffer((__gm__ X_T *)x);
         yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
         wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
-        loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices);
-        seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen);
+        loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
+        seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize);
         pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
         pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
@@ -226,14 +227,16 @@ private:
 #define SGMV_SHRINK_TYPE_DECLARE(TYPE) \
     extern "C" __global__ __aicore__ void sgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, \
-                                                             __gm__ void* loraIndices, __gm__ void* seqLen, \
+                                                             __gm__ void* loraIndices, uint32_t loraIndicesSize, \
+                                                             __gm__ void* seqLen, uint32_t seqLenSize, \
                                                              __gm__ void* y, uint32_t batchSize, \
                                                              uint32_t numTokensPerCore, uint32_t inputHiddenDim, \
                                                              uint32_t maxLoRARank, float scale) \
     { \
         AscendC::TPipe pipe; \
         SGMVShrink<TYPE> op(&pipe); \
-        op.Init(x, weight, loraIndices, seqLen,y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
+        op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, \
+                y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
         op.Process(); \
     }
@@ -244,18 +247,23 @@ SGMV_SHRINK_TYPE_DECLARE(half)
 #endif
 namespace vllm_ascend {
-extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* loraIndices, void* seqLen,
+extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight,
+                             void* loraIndices, uint32_t loraIndicesSize,
+                             void* seqLen, uint32_t seqLenSize,
                              void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
                              uint32_t maxLoRARank, float scale)
 {
     uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
     if (type == AscendType::FP16) {
-        sgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, seqLen, y, batchSize,
+        sgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,
+                                                        y, batchSize,
                                                         numTokensPerCore, inputHiddenDim, maxLoRARank,
                                                         scale);
     } else if (type == AscendType::BF16) {
 #if (__CCE_AICORE__ >= 220)
-        sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, seqLen, y, batchSize,
+        sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
+                                                              seqLen, seqLenSize,
+                                                              y, batchSize,
                                                               numTokensPerCore, inputHiddenDim, maxLoRARank,
                                                               scale);
 #endif
```