v0.10.1rc1

2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletions
--- a/csrc/kernels/bgmv_expand.cpp
+++ b/csrc/kernels/bgmv_expand.cpp
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_operator.h"
+#include "types.h"
+
+template <typename scalar_t>
+class BGMVExpand {
+public:
+    using X_T = float;
+    using W_T = scalar_t;
+    using Y_T = scalar_t;
+
+    static constexpr uint64_t LORA_RANK_8 = 8;
+    static constexpr uint64_t LORA_RANK_16 = 16;
+    static constexpr uint64_t LORA_RANK_32 = 32;
+    static constexpr uint64_t LORA_RANK_64 = 64;
+    static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64};
+    static constexpr int32_t BUFFER_NUM = 2;
+
+    // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time.
+    static constexpr int32_t NUM_BYTES_PER_REPEAT = 256;
+    static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8;
+    // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type).
+    static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float);
+    // Mask is used to control the elements that participate in computation in each iteration.
+    static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float);
+    // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants.
+    static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192;
+    static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096;
+    static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT;
+    // BlockReduceSum generates (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT) floats,
+    // so the following PairReduceSum needs enough repeats to read all of them.
+    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 = 
+        (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
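+    // The second PairReduceSum for rank=32 needs half as many repeats as the first one (the rank=16 count).
+    // Rank=64 does not use this constant: it applies a second BlockReduceSum with
+    // PAIR_REDUCE_NUM_REPEATS_16 repeats instead. Ranks greater than 64 are not supported.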
+    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2;
+
+public:
+    // Stores the pipe pointer in pipe_; Init() allocates every queue and buffer through it.
+    __aicore__ inline BGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {}
+
+    // Records the problem dimensions, binds the global-memory tensors and allocates every local
+    // queue/buffer through pipe_.
+    //
+    //   x                : activations, read as X_T (float); maxLoRARank values per token.
+    //   weight           : LoRA weights, read as W_T; one LoRA spans maxLoRARank * outputHiddenDim elements.
+    //   indices          : int64_t LoRA index per token (indicesSize entries); Process() skips negative entries.
+    //   yIn / yOut       : Y_T tensors the result is added to / stored to, both addressed at
+    //                      outputFullDim * token + sliceOffset.
+    //   batchSize        : total number of tokens.
+    //   numTokensPerCore : number of consecutive tokens each core processes.
+    //   maxLoRARank      : LoRA rank; Compute() only has branches for 8, 16, 32 and 64.
+    //   outputHiddenDim  : number of output elements produced per token.
+    //   sliceOffset      : element offset added to every row start of yIn / yOut.
+    //   outputFullDim    : distance in elements between consecutive token rows of yIn / yOut.
+    __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,
+                                uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,
+                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
+                                uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
+    {
+        batchSize_ = batchSize;
+        numTokensPerCore_ = numTokensPerCore;
+        maxLoRARank_ = maxLoRARank;
+        outputHiddenDim_ = outputHiddenDim;
+        sliceOffset_ = sliceOffset;
+        outputFullDim_ = outputFullDim;
+        // Number of weight elements that belong to a single LoRA.
+        singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_;
+
+        // Only the index tensor is bound with an explicit element count.
+        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
+        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
+        yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn);
+        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut);
+        indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);
+
+        // Queues that stage tiles in their storage dtype; X uses a single buffer, the others BUFFER_NUM.
+        pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T));
+        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T));
+        pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
+        pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
+
+        // float scratch buffers used by the computation itself.
+        pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float));
+        pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float));
+        pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
+        pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
+
+        // Each compute iteration would generate not one, but several output elements.
+        // Therefore, the following variable would determine how many output elements are calculated in each iteration.
+        // NOTE: maxLoRARank_ must lie in [1, NUM_ELEMENTS_PER_REPEAT]. A rank of 0 divides by zero on the
+        // next line; a larger rank makes numOutputElementsPerInputTile_ 0, which divides by zero on the
+        // line after it.
+        numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_);
+        numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_;
+
+    }
+
+    // Runs the kernel for the tokens owned by this core. Every token with a non-negative LoRA index
+    // has its output row streamed tile by tile, followed by the partial tail tile.
+    __aicore__ inline void Process()
+    {
+        // This core owns tokens [firstToken, lastToken), clamped to the batch size.
+        const int64_t coreIdx = AscendC::GetBlockIdx();
+        const int64_t firstToken = coreIdx * numTokensPerCore_;
+        int64_t lastToken = firstToken + numTokensPerCore_;
+        if (lastToken > batchSize_) {
+            lastToken = batchSize_;
+        }
+        // Number of complete Y tiles per output row; the same for every token.
+        const int32_t fullOutputTiles = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
+
+        for (int64_t token = firstToken; token < lastToken; ++token) {
+            yOffset_ = outputFullDim_ * token + sliceOffset_;
+
+            // Fetch the LoRA index; tokens with a negative index are left untouched.
+            CopyInIndex(token);
+            if (reqLoRAIndex_ >= 0) {
+                reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
+
+                CopyInX(token);
+                for (int32_t outTile = 0; outTile < fullOutputTiles; ++outTile) {
+                    CopyInY(outTile);
+                    // Every output tile is assembled from numStreamInPerOutputTile_ weight tiles.
+                    int32_t inTile = 0;
+                    while (inTile < numStreamInPerOutputTile_) {
+                        CopyInW(outTile * numStreamInPerOutputTile_ + inTile);
+                        Compute(inTile * numOutputElementsPerInputTile_);
+                        ++inTile;
+                    }
+                    ScaleOutput();
+                    CopyOut(outTile);
+                }
+                ComputeLastIteration();
+            }
+        }
+    }
+
+private:
+    // Reads the LoRA index assigned to token idx from indicesGm_ into reqLoRAIndex_.
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // Look up the LoRA index
+        reqLoRAIndex_ = indicesGm_.GetValue(idx);
+    }
+
+    // Handles the trailing partial output tile of the current token, i.e. the last
+    // outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS output elements. Does nothing when the output row
+    // is an exact multiple of the tile size.
+    __aicore__ inline void ComputeLastIteration()
+    {
+        const int32_t tailY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
+        if (tailY != 0) {
+            const int32_t fullOutputTiles = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
+            // Weight elements feeding the tail: maxLoRARank_ per output element.
+            const int32_t tailW = tailY * maxLoRARank_;
+            const int32_t fullWTiles = tailW / W_IN_TILE_NUM_ELEMENTS;
+            const int32_t partialWElements = tailW % W_IN_TILE_NUM_ELEMENTS;
+            // Index of the first weight tile that belongs to the tail.
+            const uint32_t firstTailWTile = fullOutputTiles * numStreamInPerOutputTile_;
+
+            CopyInY(fullOutputTiles, tailY);
+
+            // Complete weight tiles use the default repeat counts.
+            int32_t wTile = 0;
+            while (wTile < fullWTiles) {
+                CopyInW(firstTailWTile + wTile);
+                Compute(wTile * numOutputElementsPerInputTile_);
+                ++wTile;
+            }
+
+            // The final, shorter weight tile needs repeat counts derived from its own length.
+            if (partialWElements != 0) {
+                CopyInW(firstTailWTile + fullWTiles, partialWElements);
+                const int32_t blockReduceRepeats = partialWElements / NUM_ELEMENTS_PER_REPEAT;
+                const int32_t pairRepeats16 =
+                    (blockReduceRepeats * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) /
+                    NUM_ELEMENTS_PER_REPEAT;
+                const int32_t pairRepeats32 = (pairRepeats16 + 1) / 2;
+                Compute(wTile * numOutputElementsPerInputTile_, blockReduceRepeats, pairRepeats16, pairRepeats32);
+            }
+
+            ScaleOutput(tailY);
+            CopyOut(fullOutputTiles, tailY);
+        }
+    }
+
+    // Loads the maxLoRARank_ input activations of token idx and replicates them across dupBufferX_,
+    // so that one vector repeat (NUM_ELEMENTS_PER_REPEAT floats) holds
+    // NUM_ELEMENTS_PER_REPEAT / maxLoRARank_ back-to-back copies of x.
+    //
+    // The replication loops iterate over the rank in the outer loop. Each source element is therefore
+    // read once instead of once per copy, and maxLoRARank_ == 0 yields zero iterations rather than a
+    // loop whose `i += maxLoRARank_` step never advances.
+    __aicore__ inline void CopyInX(const int64_t idx)
+    {
+        constexpr uint32_t repeatElements = static_cast<uint32_t>(NUM_ELEMENTS_PER_REPEAT);
+
+        AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
+        if constexpr (std::is_same_v<X_T, float>) {
+            DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_);
+        } else {
+            uint16_t blockLen = static_cast<uint16_t>(maxLoRARank_ * sizeof(X_T));
+            DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {});
+        }
+        // Round-trip through the queue so the copy has landed before xLocal is read below.
+        inQueueX_.EnQue(xLocal);
+        xLocal = inQueueX_.DeQue<X_T>();
+        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
+
+        // As we are generating multiple output elements with one API invocation,
+        // we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT
+        if constexpr (std::is_same_v<X_T, float>) {
+            // x is already float: copy element j into slot j of every replica.
+            for (uint32_t j = 0; j < maxLoRARank_; j++) {
+                const float entry = xLocal.GetValue(j);
+                for (uint32_t i = 0; i < repeatElements; i += maxLoRARank_) {
+                    xDup.SetValue(i + j, entry);
+                }
+            }
+        } else {
+            // Convert the first replica with a vector Cast, then copy it into the remaining replicas.
+            Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
+            pipe_barrier(PIPE_V);
+
+            for (uint32_t j = 0; j < maxLoRARank_; j++) {
+                const float entry = xDup.GetValue(j);
+                for (uint32_t i = maxLoRARank_; i < repeatElements; i += maxLoRARank_) {
+                    xDup.SetValue(i + j, entry);
+                }
+            }
+        }
+        inQueueX_.FreeTensor(xLocal);
+    }
+
+    // Queues numElements values of yIn for the current token, starting at tile index `progress`
+    // (in units of Y_OUT_TILE_NUM_ELEMENTS) past yOffset_.
+    __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
+    {
+        const uint64_t srcOffset = yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS;
+        AscendC::LocalTensor<Y_T> yTile = inQueueY_.AllocTensor<Y_T>();
+        DataCopy(yTile, yInGm_[srcOffset], numElements);
+        inQueueY_.EnQue(yTile);
+    }
+
+    // Queues numElements weight values of the selected LoRA, starting at tile index `progress`
+    // (in units of W_IN_TILE_NUM_ELEMENTS) past reqLoRAWeightOffset_.
+    __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS)
+    {
+        const uint64_t srcOffset = reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS;
+        AscendC::LocalTensor<W_T> wTile = inQueueW_.AllocTensor<W_T>();
+        DataCopy(wTile, wGm_[srcOffset], numElements);
+        inQueueW_.EnQue(wTile);
+    }
+
+    // Finishes one output tile: adds the queued yIn tile to the float results accumulated in
+    // tmpBufferY_, converts the sum to Y_T and queues it for CopyOut().
+    // Despite the name, no scaling factor is applied in this function.
+    //
+    //   numElements : number of output elements in the tile (smaller than the default for the tail tile).
+    __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
+    {
+        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
+        AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
+        AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
+        // Widen the incoming Y tile to float so the addition happens in float.
+        Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
+        pipe_barrier(PIPE_V);
+        inQueueY_.FreeTensor(yInLocal);
+
+        // yLocal already holds the values written by Compute(); accumulate yIn onto them in place.
+        Add(yLocal, yLocal, yInLocalFP32, numElements);
+        pipe_barrier(PIPE_V);
+
+        // Convert back to the output dtype using CAST_RINT rounding.
+        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
+        Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
+        pipe_barrier(PIPE_V);
+
+        outQueueY_.EnQue<Y_T>(yOutLocal);
+    }
+
+    // Consumes one queued weight tile and writes the resulting dot products into tmpBufferY_
+    // starting at element `progress`.
+    //
+    // The tile is widened to float, multiplied element-wise with the replicated x vector in
+    // dupBufferX_, and then every group of maxLoRARank_ consecutive products is summed by a
+    // rank-specific chain of reductions.
+    //
+    //   progress               : index of the first output element produced by this tile.
+    //   blockReduceRepeatCount : repeats for Cast, Mul and the first BlockReduceSum.
+    //   pairReduceRepeat16     : repeats for the second reduction stage (ranks 16, 32 and 64).
+    //   pairReduceRepeat32     : repeats for the third reduction stage (rank 32 only).
+    //
+    // NOTE: there is no branch for ranks other than 8/16/32/64; for such ranks nothing is written
+    // to tmpBufferY_ by this call.
+    __aicore__ inline void Compute(int32_t progress,
+                                   int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS,
+                                   int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16,
+                                   int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32)
+    {
+        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
+        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
+        AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
+        AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
+
+        // Widen the weight tile to float; the queue slot can be released once the cast is done.
+        Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
+        pipe_barrier(PIPE_V);
+        inQueueW_.FreeTensor(wLocal);
+
+        // Element-wise product with x; dotProductParams_ re-reads the same xDup repeat every time.
+        Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
+        pipe_barrier(PIPE_V);
+
+        if (maxLoRARank_ == LORA_RANK_8) {
+            // Rank 8: a single BlockReduceSum writes the outputs directly.
+            BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        } else if (maxLoRARank_ == LORA_RANK_16) {
+            // Rank 16: BlockReduceSum in place, then one PairReduceSum into the output
+            // (assumes PairReduceSum adds adjacent element pairs -- confirm against the API reference).
+            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
+                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        } else if (maxLoRARank_ == LORA_RANK_32) {
+            // Rank 32: BlockReduceSum in place, PairReduceSum in place, then a second PairReduceSum
+            // into the output.
+            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
+                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        } else if (maxLoRARank_ == LORA_RANK_64) {
+            // Rank 64: two BlockReduceSum stages; the second one reuses the pairReduceRepeat16 count.
+            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
+                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        }
+    }
+
+    // Writes the queued output tile to yOut: numElements values at tile index `progress`
+    // (in units of Y_OUT_TILE_NUM_ELEMENTS) past yOffset_, then releases the queue slot.
+    __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
+    {
+        const uint64_t dstOffset = yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS;
+        AscendC::LocalTensor<Y_T> finishedTile = outQueueY_.DeQue<Y_T>();
+        DataCopy(yOutGm_[dstOffset], finishedTile, numElements);
+        outQueueY_.FreeTensor(finishedTile);
+    }
+
+private:
+    AscendC::TPipe* pipe_;
+    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueY_, inQueueW_;
+    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX_;
+    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueY_;
+    AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_;
+    AscendC::GlobalTensor<X_T> xGm_;
+    AscendC::GlobalTensor<W_T> wGm_;
+    AscendC::GlobalTensor<Y_T> yInGm_;
+    AscendC::GlobalTensor<Y_T> yOutGm_;
+    AscendC::GlobalTensor<int64_t> indicesGm_;
+    uint32_t batchSize_;
+    uint32_t numTokensPerCore_;
+    uint32_t maxLoRARank_;
+    uint32_t outputHiddenDim_;
+    uint32_t sliceOffset_;
+    uint32_t outputFullDim_;
+    uint32_t singleLoRAWeightLen_;
+    int64_t reqLoRAIndex_;
+    uint64_t reqLoRAWeightOffset_;
+    uint32_t numOutputElementsPerInputTile_;
+    uint32_t numStreamInPerOutputTile_;
+    uint64_t yOffset_;
+
+    // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously.
+    // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat,
+    // reads next 8 consecutive blocks in the second repeat.
+    AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4};
+
+    // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block,
+    // so we set dstRepStride = 1
+    AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8};
+
+    // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks.
+    // For xDup we repeatedly use it, so we set src0RepStride = 0
+    AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8};
+
+};
+
+// Defines the extern "C" kernel entry point bgmv_expand_<TYPE>. The entry point creates a TPipe,
+// builds a BGMVExpand<TYPE> on it, forwards all arguments to Init() and then runs Process().
+#define BGMV_EXPAND_TYPE_DECLARE(TYPE)                                                                                 \
+    extern "C" __global__ __aicore__ void bgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
+                                                             uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,\
+                                                             uint32_t batchSize, uint32_t numTokensPerCore,            \
+                                                             uint32_t maxLoRARank, uint32_t outputHiddenDim,           \
+                                                             uint32_t sliceOffset, uint32_t outputFullDim)             \
+    {                                                                                                                  \
+        AscendC::TPipe pipe;                                                                                           \
+        BGMVExpand<TYPE> op(&pipe);                                                                                    \
+        op.Init(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore, maxLoRARank,                  \
+                outputHiddenDim, sliceOffset, outputFullDim);                                                          \
+        op.Process();                                                                                                  \
+    }
+
+// Define the kernel entry point for every supported dtype.
+// The bfloat16_t variant is only defined when __CCE_AICORE__ >= 220.
+BGMV_EXPAND_TYPE_DECLARE(half)
+#if (__CCE_AICORE__ >= 220)
+    BGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
+#endif
+
+namespace vllm_ascend {
+// Host-side launcher for the bgmv_expand kernels.
+//
+// Selects the kernel matching `type` (FP16 or BF16) and launches it on `stream` with one core per
+// group of numTokensPerCore tokens. All remaining arguments are forwarded to the kernel unchanged.
+//
+// The call is a no-op when:
+//   - batchSize is 0 (nothing to compute) or numTokensPerCore is 0 (no valid launch geometry;
+//     it would otherwise divide by zero),
+//   - `type` is neither FP16 nor BF16,
+//   - `type` is BF16 but the bfloat16 kernel is not compiled in (__CCE_AICORE__ < 220).
+extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
+                             void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
+                             uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
+{
+    if (batchSize == 0 || numTokensPerCore == 0) {
+        return;
+    }
+    // Ceiling division written so that the intermediate sum cannot wrap around in uint32_t.
+    const uint32_t blockDim = batchSize / numTokensPerCore + (batchSize % numTokensPerCore != 0 ? 1 : 0);
+    if (type == AscendType::FP16) {
+        bgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
+                                                        numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset,
+                                                        outputFullDim);
+    } else if (type == AscendType::BF16) {
+        #if (__CCE_AICORE__ >= 220)
+            bgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
+                                                                  numTokensPerCore, maxLoRARank, outputHiddenDim,
+                                                                  sliceOffset, outputFullDim);
+        #endif
+    } else {
+        return;
+    }
+}
+
+} // namespace vllm_ascend
--- a/csrc/kernels/bgmv_shrink.cpp
+++ b/csrc/kernels/bgmv_shrink.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_operator.h"
+#include "types.h"
+
+template <typename scalar_t>
+class BGMVShrink {
+public:
+    using X_T = scalar_t;
+    using W_T = scalar_t;
+    using Y_T = float;
+
+    static constexpr uint64_t BUFFER_NUM = 1;
+    static constexpr uint64_t TILE_LENGTH = 11776;  // optimal performance tile length
+
+public:
+    // Stores the pipe pointer in pipe_; Init() allocates every queue and buffer through it.
+    __aicore__ inline BGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
+    // Records the problem dimensions, binds the global-memory tensors and allocates every local
+    // queue/buffer through pipe_.
+    //
+    //   x                : activations, read as X_T; inputHiddenDim values per token.
+    //   weight           : LoRA weights, read as W_T; one LoRA spans inputHiddenDim * maxLoRARank elements,
+    //                      addressed as rowIdx * inputHiddenDim + column.
+    //   indices          : int64_t LoRA index per token (indicesSize entries); Process() skips negative entries.
+    //   y                : float output; maxLoRARank values are written per token.
+    //   batchSize        : total number of tokens.
+    //   numTokensPerCore : number of consecutive tokens each core processes.
+    //   inputHiddenDim   : length of each input row.
+    //   maxLoRARank      : number of output values per token.
+    //   scale            : factor applied to every output value in ScaleOutput().
+    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *indices, uint32_t indicesSize, __gm__ void *y,
+                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
+                                uint32_t maxLoRARank, float scale)
+    {
+        batchSize_ =  batchSize;
+        numTokensPerCore_ = numTokensPerCore;
+        inputHiddenDim_ = inputHiddenDim;
+        maxLoRARank_ = maxLoRARank;
+        scale_ = scale;
+        // Number of weight elements that belong to a single LoRA.
+        singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
+        // Rows longer than one tile are streamed tile by tile (x is re-read for every output row);
+        // shorter rows are loaded and converted once per token.
+        incremental_ = inputHiddenDim_ > TILE_LENGTH;
+
+        // Only the index tensor is bound with an explicit element count.
+        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
+        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
+        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
+        indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);
+
+        // One tile of x and w in their storage dtype, plus float copies used for the arithmetic.
+        pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
+        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
+        pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
+        pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));
+        
+        // Output staging: maxLoRARank_ floats per token.
+        pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
+        pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
+    }
+
+    // Runs the kernel for the tokens owned by this core. For every token with a non-negative LoRA
+    // index the maxLoRARank_ dot products are computed, scaled and written out.
+    __aicore__ inline void Process()
+    {
+        // This core owns tokens [firstToken, lastToken), clamped to the batch size.
+        const int64_t coreIdx = AscendC::GetBlockIdx();
+        const int64_t firstToken = coreIdx * numTokensPerCore_;
+        int64_t lastToken = firstToken + numTokensPerCore_;
+        if (lastToken > batchSize_) {
+            lastToken = batchSize_;
+        }
+
+        for (int64_t token = firstToken; token < lastToken; ++token) {
+            // Fetch the LoRA index; tokens with a negative index are left untouched.
+            CopyInIndex(token);
+            if (reqLoRAIndex_ >= 0) {
+                reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
+
+                // Pick the code path chosen in Init(): tiled streaming or single-shot.
+                if (!incremental_) {
+                    ProcessImpl<false>(token);
+                } else {
+                    ProcessImpl<true>(token);
+                }
+
+                ScaleOutput();
+                CopyOut(token);
+            }
+        }
+    }
+
+private:
+    // Computes the maxLoRARank_ dot products of token idx and stores them in outBufferY_.
+    //
+    // INCREMENTAL_MODE == false: x is loaded and converted to float once, then reused for every row.
+    // INCREMENTAL_MODE == true : x is re-loaded tile by tile for every weight row.
+    template <bool INCREMENTAL_MODE>
+    __aicore__ inline void ProcessImpl(const int64_t idx)
+    {
+        AscendC::LocalTensor<float> results = outBufferY_.Get<float>();
+        if constexpr (!INCREMENTAL_MODE) {
+            // Whole input row at once; the float copy stays in tmpBufferX_ for all rows below.
+            CopyInX(idx, 0, inputHiddenDim_);
+            AscendC::LocalTensor<float> xAsFloat = tmpBufferX_.Get<float>();
+            AscendC::LocalTensor<X_T> xRaw = inQueueX_.DeQue<X_T>();
+            Cast(xAsFloat, xRaw, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
+            pipe_barrier(PIPE_V);
+            inQueueX_.FreeTensor(xRaw);
+        }
+
+        // Number of complete tiles in one input row.
+        const uint64_t fullTiles = inputHiddenDim_ / TILE_LENGTH;
+        int row = 0;
+        while (row < maxLoRARank_) {
+            float rowSum = 0.0f;
+            for (int32_t tile = 0; tile < fullTiles; ++tile) {
+                if constexpr (INCREMENTAL_MODE) {
+                    CopyInX(idx, tile);
+                }
+                CopyInW(row, tile);
+                Compute<INCREMENTAL_MODE>(rowSum);
+            }
+            // Partial tail tile, if any.
+            CopyAndComputeLastIteration<INCREMENTAL_MODE>(idx, row, rowSum);
+            results.SetValue(row, rowSum);
+            ++row;
+        }
+    }
+
+    // Reads the LoRA index assigned to token idx from indicesGm_ into reqLoRAIndex_.
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // look up the LoRA index
+        reqLoRAIndex_ = indicesGm_.GetValue(idx);
+    }
+
+    // Queues numElements input values of token idx, starting at tile index colIdx
+    // (in units of TILE_LENGTH) within the token's row.
+    __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
+    {
+        const uint64_t srcOffset = inputHiddenDim_ * idx + colIdx * TILE_LENGTH;
+        AscendC::LocalTensor<X_T> xTile = inQueueX_.AllocTensor<X_T>();
+        DataCopy(xTile, xGm_[srcOffset], numElements);
+        inQueueX_.EnQue(xTile);
+    }
+
+    // Queues numElements weight values of row rowIdx of the selected LoRA, starting at tile index
+    // colIdx (in units of TILE_LENGTH) within that row.
+    __aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
+    {
+        const uint64_t srcOffset = reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH;
+        AscendC::LocalTensor<W_T> wTile = inQueueW_.AllocTensor<W_T>();
+        DataCopy(wTile, wGm_[srcOffset], numElements);
+        inQueueW_.EnQue(wTile);
+    }
+
+    // Consumes one queued weight tile (and, in incremental mode, one queued x tile) and adds the
+    // dot product of the two tiles to acc.
+    //
+    //   acc         : running sum for the current output value; updated in place.
+    //   numElements : number of valid elements in the tile(s).
+    //
+    // In non-incremental mode the float copy of x is expected to be in tmpBufferX_ already
+    // (ProcessImpl<false> fills it before the first call).
+    template <bool INCREMENTAL_MODE>
+    __aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
+    {
+        AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
+        AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
+        AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
+
+        if constexpr (INCREMENTAL_MODE) {
+            // Both tiles arrive in their storage dtype; widen them to float and release the queue slots.
+            AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
+            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            pipe_barrier(PIPE_V);
+            inQueueX_.FreeTensor(xLocal);
+            inQueueW_.FreeTensor(wLocal);
+        } else {
+            // Only the weight tile needs converting here.
+            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            pipe_barrier(PIPE_V);
+            inQueueW_.FreeTensor(wLocal);
+        }
+        // Element-wise product of one tile of X and W, stored in place in wTmpTensor.
+        Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
+        pipe_barrier(PIPE_V);
+        // Sum all products into a single number; wTmpTensor serves as destination, source and
+        // work buffer at the same time.
+        ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
+        pipe_barrier(PIPE_V);
+
+        // The reduced value is read from element 0.
+        acc += wTmpTensor.GetValue(0);
+    }
+
+    // Processes the partial tail tile (inputHiddenDim_ % TILE_LENGTH elements) of weight row rowIdx
+    // for token idx and adds its dot product to acc. Does nothing when the row length is an exact
+    // multiple of TILE_LENGTH.
+    template <bool INCREMENTAL_MODE>
+    __aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
+    {
+        const int32_t tailTile = inputHiddenDim_ / TILE_LENGTH;
+        const int32_t tailElements = inputHiddenDim_ % TILE_LENGTH;
+        if (tailElements != 0) {
+            // x is only re-loaded in incremental mode.
+            if constexpr (INCREMENTAL_MODE) {
+                CopyInX(idx, tailTile, tailElements);
+            }
+            CopyInW(rowIdx, tailTile, tailElements);
+            Compute<INCREMENTAL_MODE>(acc, tailElements);
+        }
+    }
+
+    // Multiplies the maxLoRARank_ accumulated results in outBufferY_ by scale_ and queues the
+    // scaled values for CopyOut().
+    __aicore__ inline void ScaleOutput()
+    {
+        AscendC::LocalTensor<float> accumulated = outBufferY_.Get<float>();
+        AscendC::LocalTensor<Y_T> scaled = outQueueY_.AllocTensor<Y_T>();
+
+        Muls(scaled, accumulated, scale_, maxLoRARank_);
+        pipe_barrier(PIPE_V);
+        outQueueY_.EnQue<Y_T>(scaled);
+    }
+
+    // Writes the queued maxLoRARank_ results of token idx to yOutGm_ at element offset
+    // maxLoRARank_ * idx, then releases the queue slot.
+    __aicore__ inline void CopyOut(const int64_t idx)
+    {
+        const int64_t dstOffset = maxLoRARank_ * idx;
+        AscendC::LocalTensor<Y_T> finished = outQueueY_.DeQue<Y_T>();
+        DataCopy(yOutGm_[dstOffset], finished, maxLoRARank_);
+        outQueueY_.FreeTensor(finished);
+    }
+
+private:
+    AscendC::TPipe *pipe_;
+    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX_, inQueueW_;
+    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueY_;
+    AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferX_, tmpBufferW_, outBufferY_;
+    AscendC::GlobalTensor<X_T> xGm_;
+    AscendC::GlobalTensor<W_T> wGm_;
+    AscendC::GlobalTensor<int64_t> indicesGm_;
+    AscendC::GlobalTensor<Y_T> yOutGm_;
+    uint32_t batchSize_;
+    uint32_t numTokensPerCore_;
+    uint32_t inputHiddenDim_;
+    uint32_t maxLoRARank_;
+    float scale_;
+    uint32_t singleLoRAWeightLen_;
+    int64_t reqLoRAIndex_;
+    uint64_t reqLoRAWeightOffset_;
+    bool incremental_;
+};
+
+// Defines the extern "C" kernel entry point bgmv_shrink_<TYPE>. The entry point creates a TPipe,
+// builds a BGMVShrink<TYPE> on it, forwards all arguments to Init() and then runs Process().
+#define BGMV_SHRINK_TYPE_DECLARE(TYPE)                                                                                 \
+    extern "C" __global__ __aicore__ void bgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
+                                                             uint32_t indicesSize, __gm__ void* y, uint32_t batchSize, \
+                                                             uint32_t numTokensPerCore, uint32_t inputHiddenDim,       \
+                                                             uint32_t maxLoRARank, float scale)                        \
+    {                                                                                                                  \
+        AscendC::TPipe pipe;                                                                                           \
+        BGMVShrink<TYPE> op(&pipe);                                                                                    \
+        op.Init(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale);  \
+        op.Process();                                                                                                  \
+    }
+
+// Define the kernel entry point for every supported dtype.
+// The bfloat16_t variant is only defined when __CCE_AICORE__ >= 220.
+BGMV_SHRINK_TYPE_DECLARE(half)
+#if (__CCE_AICORE__ >= 220)
+    BGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
+#endif
+
+namespace vllm_ascend {
+// Host-side launcher for the bgmv_shrink kernels.
+//
+// Selects the kernel matching `type` (FP16 or BF16) and launches it on `stream` with one core per
+// group of numTokensPerCore tokens. All remaining arguments are forwarded to the kernel unchanged.
+//
+// The call is a no-op when:
+//   - batchSize is 0 (nothing to compute) or numTokensPerCore is 0 (no valid launch geometry;
+//     it would otherwise divide by zero),
+//   - `type` is neither FP16 nor BF16,
+//   - `type` is BF16 but the bfloat16 kernel is not compiled in (__CCE_AICORE__ < 220).
+extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
+                             void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
+                             uint32_t maxLoRARank, float scale)
+{
+    if (batchSize == 0 || numTokensPerCore == 0) {
+        return;
+    }
+    // Ceiling division written so that the intermediate sum cannot wrap around in uint32_t.
+    const uint32_t blockDim = batchSize / numTokensPerCore + (batchSize % numTokensPerCore != 0 ? 1 : 0);
+    if (type == AscendType::FP16) {
+        bgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
+                                                        inputHiddenDim, maxLoRARank, scale);
+    } else if (type == AscendType::BF16) {
+        #if (__CCE_AICORE__ >= 220)
+            bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize,
+                                                                  numTokensPerCore, inputHiddenDim, maxLoRARank, scale);
+        #endif
+    } else {
+        return;
+    }
+}
+
+} // namespace vllm_ascend
--- a/csrc/kernels/get_masked_input_and_mask_kernel.cpp
+++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp
@@ -0,0 +1,378 @@
+/* 
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ */
+
+#include "kernel_operator.h"
+#include "kernel_tensor_impl.h"
+#include "kernel_type.h"
+#include "types.h"
+#include "utils.h"
+using vllm_ascend::AccType;
+
+template<typename scalar_t>
+class GetMaskedInputAndMask {
+public:
+    __aicore__ inline GetMaskedInputAndMask() {}  // Empty; all state is populated by Init().
+    
+    __aicore__ inline ~GetMaskedInputAndMask() {  // Resets the member TPipe when the operator is destroyed.
+        pipe.Reset();
+    }
+
+    
+    __aicore__ inline void Init(  // Binds the GM pointers, derives size_/added_offset_ and allocates every UB buffer.
+        __gm__ scalar_t* input,
+        __gm__ scalar_t* masked_input, 
+        __gm__ bool* mask_out,
+        const int64_t org_vocab_start_index,
+        const int64_t org_vocab_end_index,
+        const int64_t num_org_vocab_padding,
+        const int64_t added_vocab_start_index,
+        const int64_t added_vocab_end_index,
+        const int64_t size)  // element count requested for this chunk; padded below
+    {
+        // Initialize basic parameters
+        input_ = input;
+        masked_input_ = masked_input;
+        mask_out_ = mask_out;
+        org_vocab_start_index_ = org_vocab_start_index;
+        org_vocab_end_index_ = org_vocab_end_index;
+        size_ = ((size + 31) / 32) * 32;  // round up to a multiple of 32 elements; every copy and vector op uses size_
+        added_offset_ = added_vocab_start_index - 
+            (org_vocab_end_index - org_vocab_start_index) - 
+            num_org_vocab_padding;  // amount subtracted from ids that fall in the added-vocab range (see Compute)
+        added_vocab_start_index_ = added_vocab_start_index;
+        added_vocab_end_index_ = added_vocab_end_index;
+
+        // Initialize global tensors
+        inputGlobal.SetGlobalBuffer(input);
+        maskedOutputGlobal.SetGlobalBuffer(masked_input); 
+        maskOutGlobal.SetGlobalBuffer(mask_out);
+
+        // Initialize queues. NOTE(review): the kernel entry calls Init() once per loop iteration on the same object, so every InitBuffer in this function repeats on one TPipe - confirm that is supported.
+        pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t));
+        pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t));
+        pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool));
+        
+        // Initialize calculation buffers
+        // NOTE: calc_buf_1 and calc_buf_2 are also used for int16 casting on older archs.
+        pipe.InitBuffer(calc_buf_1, size_ * sizeof(float));
+        pipe.InitBuffer(calc_buf_2, size_ * sizeof(float));
+        
+        // Initialize result queues (BUFFER_NUM buffers each; they are used for int8_t masks but sized for float)
+        pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float));
+        pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float));
+        pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float));
+        pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float));
+
+        // Initialize temporary buffers
+        pipe.InitBuffer(start_buf, size_ * sizeof(float));
+        pipe.InitBuffer(end_buf, size_ * sizeof(float));
+        pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); // Also used for half intermediate in casting
+        pipe.InitBuffer(validOffset_buf, size_ * sizeof(float));
+        pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t));
+        pipe.InitBuffer(ones_buf_, size_ * sizeof(float));
+    }
+
+    __aicore__ inline void Process()  // Runs one CopyIn -> Compute -> CopyOut pass over the chunk bound by Init().
+    {
+        CopyIn();
+        Compute();
+        CopyOut();
+    }
+
+private:
+    __aicore__ inline void CopyIn()  // Loads size_ input elements from GM into a tensor taken from inQueue.
+    {
+        AscendC::LocalTensor<scalar_t> inputLocal = inQueue.AllocTensor<scalar_t>();
+        AscendC::DataCopy(inputLocal, inputGlobal, size_);  // size_ is the padded count: up to 31 elements beyond the requested size are read
+        inQueue.EnQue(inputLocal);  // handed to Compute(), which dequeues and later frees it
+    }
+
+    // Builds an element-wise 0/1 mask in `result` using arithmetic only (no vector compare instruction).
+    //
+    // With m = max(input, compare_value), the difference d = m - compare_value (or compare_value - m when
+    // is_greater_equal is set; only the sign differs) is zero exactly where input <= compare_value.
+    // The chain below maps d == 0 to 1.0 and any |d| >= MIN_ACCURACY_FP32 to 0.0:
+    //   |d| -> min(|d|, MIN_ACCURACY_FP32) -> * MAX_MUL_1_FP32 * MAX_MUL_1_FP32 * MAX_MUL_2_FP32
+    //       -> + NEGATIVE_ONE_FP32 -> |.|
+    // The fp32 mask is then narrowed fp32 -> half -> int8_t into `result`.
+    //
+    // Because the sign of d is discarded, is_greater_equal does not change the resulting mask; a ">=" test is
+    // expressed by the caller swapping the two operands (see ComputeRangeMask).
+    //
+    // Scratch: overwrites calc_buf_1 (fp32) and calc_buf_2 (half).
+    __aicore__ inline void CompareWithValue(
+        AscendC::LocalTensor<int8_t>& result,
+        const AscendC::LocalTensor<float>& input,
+        const AscendC::LocalTensor<float>& compare_value,
+        bool is_greater_equal)
+    {
+        AscendC::LocalTensor<float> diff = calc_buf_1.Get<float>();
+
+        // m = max(input, compare_value) is needed in both modes.
+        AscendC::Max(diff, input, compare_value, size_);
+        if (!is_greater_equal) {
+            AscendC::Sub(diff, diff, compare_value, size_);  // d = m - compare_value
+        } else {
+            AscendC::Sub(diff, compare_value, diff, size_);  // d = compare_value - m
+        }
+
+        // Collapse d to {0, MIN_ACCURACY_FP32}, rescale to {0, 1}, then flip to {1, 0}.
+        AscendC::Abs(diff, diff, size_);
+        AscendC::Mins(diff, diff, MIN_ACCURACY_FP32, size_);
+        AscendC::Muls(diff, diff, MAX_MUL_1_FP32, size_);
+        AscendC::Muls(diff, diff, MAX_MUL_1_FP32, size_);
+        AscendC::Muls(diff, diff, MAX_MUL_2_FP32, size_);
+        AscendC::Adds(diff, diff, NEGATIVE_ONE_FP32, size_);
+        AscendC::Abs(diff, diff, size_);
+
+        // Narrow through half on the way to int8_t.
+        AscendC::LocalTensor<half> diffHalf = calc_buf_2.Get<half>();
+        AscendC::Cast(diffHalf, diff, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(result, diffHalf, AscendC::RoundMode::CAST_NONE, size_);
+    }
+
+    // Writes range_mask[i] = 1 where start_value <= input[i] <= end_value (BOTH bounds inclusive), else 0.
+    //
+    // The two bounds are broadcast into start_buf / end_buf, each side is tested with CompareWithValue(), and the
+    // two int8_t masks are And-ed. CompareWithValue() always yields "first operand <= second operand", so the
+    // lower-bound test passes (start, input) and the upper-bound test passes (input, end).
+    //
+    // Scratch: start_buf, end_buf, calc_buf_1, calc_buf_2 are overwritten. `input` is only read.
+    //
+    // NOTE(review): the tensors taken from result_ge_que / result_le_que are never returned with FreeTensor.
+    // Compute() calls this function twice and BUFFER_NUM is 2 - confirm the queues are not meant to be recycled.
+    __aicore__ inline void ComputeRangeMask(
+        AscendC::LocalTensor<int8_t>& range_mask,
+        const AscendC::LocalTensor<float>& input,
+        const float start_value, 
+        const float end_value) {
+        
+        AscendC::LocalTensor<float> start_value_tensor = start_buf.Get<float>();
+        AscendC::LocalTensor<float> end_value_tensor = end_buf.Get<float>();
+
+        AscendC::Duplicate(start_value_tensor, start_value, size_);
+        AscendC::Duplicate(end_value_tensor, end_value, size_);
+        
+        AscendC::LocalTensor<int8_t> ge_result = result_ge_que.AllocTensor<int8_t>();
+        AscendC::LocalTensor<int8_t> le_result = result_le_que.AllocTensor<int8_t>();
+
+        CompareWithValue(ge_result, start_value_tensor, input, true);   // start_value <= input
+        CompareWithValue(le_result, input, end_value_tensor, false);    // input <= end_value
+        
+#if (__CCE_AICORE__ >= 220) 
+        AscendC::And(range_mask, ge_result, le_result, size_);
+#else
+        {
+            // WORKAROUND for older arch
+            // No direct int8->int16 cast. Use half as intermediate.
+            // No direct int8 And. Use int16 And.
+            AscendC::LocalTensor<int16_t> ge_result_i16 = calc_buf_1.Get<int16_t>();
+            AscendC::LocalTensor<int16_t> le_result_i16 = calc_buf_2.Get<int16_t>();
+            AscendC::LocalTensor<int16_t> range_mask_i16 = ge_result_i16; 
+            
+            // Temporary half buffer. start_buf is reused because the broadcast start value is dead after the
+            // comparisons above. inputFloat_buf must NOT be used here: it backs `input`, which the caller still
+            // needs after this function returns.
+            AscendC::LocalTensor<half> tmp_half = start_buf.Get<half>();
+
+            // 1. Cast inputs: int8_t -> half -> int16_t
+            AscendC::Cast(tmp_half, ge_result, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(ge_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+            
+            AscendC::Cast(tmp_half, le_result, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(le_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+
+            // 2. Perform And on int16_t tensors
+            AscendC::And(range_mask_i16, ge_result_i16, le_result_i16, size_);
+
+            // 3. Cast result back: int16_t -> half -> int8_t
+            AscendC::Cast(tmp_half, range_mask_i16, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(range_mask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+        }
+#endif
+    }
+
+    // Computes the masked token ids and the "not in this shard" mask for the chunk loaded by CopyIn().
+    //
+    // With x the input id (processed as fp32):
+    //   org_mask     = org_vocab_start_index_   <= x < org_vocab_end_index_
+    //   added_mask   = added_vocab_start_index_ <= x < added_vocab_end_index_
+    //   valid_offset = org_vocab_start_index_ * org_mask + added_offset_ * added_mask
+    //   vocab_mask   = org_mask | added_mask
+    //   masked_input = vocab_mask * (x - valid_offset)   -> enqueued on outQueue
+    //   mask_out     = 1 - vocab_mask                    -> enqueued on maskQueue
+    //
+    // Every intermediate tensor is backed by one of the TBufs allocated in Init(); a buffer is only reused once
+    // its previous content is dead, and each reuse is commented where it happens.
+    //
+    // NOTE(review): orgVocabMask / addedVocabMask are taken from their queues and never returned with FreeTensor;
+    // confirm those queues are not expected to be recycled.
+    __aicore__ inline void Compute() {
+        AscendC::LocalTensor<scalar_t> inputLocal = inQueue.DeQue<scalar_t>();
+        AscendC::LocalTensor<scalar_t> maskedLocal = outQueue.AllocTensor<scalar_t>();
+        AscendC::LocalTensor<int8_t> maskLocal = maskQueue.AllocTensor<int8_t>();
+
+        // All arithmetic is done on an fp32 copy of the ids.
+        AscendC::LocalTensor<float> inputFloat = inputFloat_buf.Get<float>();
+        AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_);
+
+        // ComputeRangeMask() is inclusive on both bounds. The ids are integers, so passing (end - 1) as the upper
+        // bound yields the half-open range [start, end); passing `end` itself would also accept x == end.
+        AscendC::LocalTensor<int8_t> orgVocabMask = result_org_mask_que.AllocTensor<int8_t>();
+        ComputeRangeMask(orgVocabMask,
+                        inputFloat,
+                        static_cast<float>(org_vocab_start_index_),
+                        static_cast<float>(org_vocab_end_index_ - 1));
+
+        AscendC::LocalTensor<int8_t> addedVocabMask = result_add_mask_que.AllocTensor<int8_t>();
+        ComputeRangeMask(addedVocabMask,
+                        inputFloat,
+                        static_cast<float>(added_vocab_start_index_),
+                        static_cast<float>(added_vocab_end_index_ - 1));
+
+        // valid_offset = org_vocab_start_index_ * org_mask + added_offset_ * added_mask
+        AscendC::LocalTensor<float> validOffset = validOffset_buf.Get<float>();
+        AscendC::LocalTensor<float> constOrgStartIndex = start_buf.Get<float>();
+        
+        AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_);
+        
+        // Staging for widening an int8_t mask to fp32 (int8_t -> half -> fp32). ComputeRangeMask() and
+        // CompareWithValue() are finished with calc_buf_1 / calc_buf_2 at this point, so both are free.
+        AscendC::LocalTensor<half> orgVocabMask_fp16 = calc_buf_2.Get<half>();
+        AscendC::LocalTensor<float> orgVocabMask_fp32 = calc_buf_1.Get<float>();
+        AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
+
+        AscendC::Mul(validOffset, constOrgStartIndex, orgVocabMask_fp32, size_);
+
+        AscendC::LocalTensor<float> addedOffsetTensor = end_buf.Get<float>();
+        AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_);
+
+        // The org mask staging was consumed by the Mul above; reuse the same two buffers for the added mask.
+        AscendC::LocalTensor<half> addedVocabMask_fp16 = calc_buf_2.Get<half>();
+        AscendC::LocalTensor<float> addedVocabMask_fp32 = calc_buf_1.Get<float>();
+        AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
+
+        // added_offset_ * added_mask, computed in place in end_buf, then accumulated into valid_offset.
+        AscendC::Mul(addedOffsetTensor, addedOffsetTensor, addedVocabMask_fp32, size_);
+        AscendC::Add(validOffset, validOffset, addedOffsetTensor, size_);
+
+        // vocab_mask = org_mask | added_mask
+        AscendC::LocalTensor<int8_t> vocabMask = vocabMask_buf_.Get<int8_t>();
+        
+#if (__CCE_AICORE__ >= 220)
+        AscendC::Or(vocabMask,
+                    orgVocabMask,
+                    addedVocabMask,
+                    size_);
+#else
+        {
+            // WORKAROUND for older arch 
+            // No direct int8->int16 cast. Use half as intermediate.
+            // No direct int8 Or. Use int16 Or.
+            AscendC::LocalTensor<int16_t> orgVocabMask_i16 = calc_buf_1.Get<int16_t>();
+            AscendC::LocalTensor<int16_t> addedVocabMask_i16 = calc_buf_2.Get<int16_t>();
+            AscendC::LocalTensor<int16_t> vocabMask_i16 = orgVocabMask_i16; 
+
+            // Temporary half buffer. start_buf is free: constOrgStartIndex was consumed by the first Mul above.
+            // inputFloat_buf must NOT be used here because inputFloat is still read below.
+            AscendC::LocalTensor<half> tmp_half = start_buf.Get<half>();
+
+            // 1. Cast inputs: int8_t -> half -> int16_t
+            AscendC::Cast(tmp_half, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(orgVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+
+            AscendC::Cast(tmp_half, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(addedVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+
+            // 2. Perform Or on int16_t tensors
+            AscendC::Or(vocabMask_i16, orgVocabMask_i16, addedVocabMask_i16, size_);
+
+            // 3. Cast result back: int16_t -> half -> int8_t
+            AscendC::Cast(tmp_half, vocabMask_i16, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(vocabMask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+        }
+#endif
+
+        // masked_input = vocab_mask * (x - valid_offset)
+        AscendC::Sub(inputFloat, inputFloat, validOffset, size_);
+
+        // Widen vocab_mask to fp32; calc_buf_1 / calc_buf_2 are free again after the Or above.
+        AscendC::LocalTensor<half> vocabMask_fp16 = calc_buf_2.Get<half>();
+        AscendC::LocalTensor<float> vocabMask_fp32 = calc_buf_1.Get<float>();
+        AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
+        
+        AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_);
+
+        AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_);  
+        outQueue.EnQue(maskedLocal);
+
+        // mask_out = 1 - vocab_mask
+        AscendC::LocalTensor<float> ones_tensor = ones_buf_.Get<float>();
+        AscendC::Duplicate(ones_tensor, (float)1, size_);
+
+        // validOffset_buf is free: valid_offset was consumed by the Sub above. vocabMask_fp32 (calc_buf_1) is
+        // still intact because nothing has written calc_buf_1 since it was filled.
+        AscendC::LocalTensor<float> maskLocal_fp32 = validOffset_buf.Get<float>();
+        AscendC::Sub(maskLocal_fp32, ones_tensor, vocabMask_fp32, size_);
+
+        // vocabMask_fp16 (calc_buf_2) is dead by now, so the half staging can live there.
+        AscendC::LocalTensor<half> maskLocal_fp16 = calc_buf_2.Get<half>();
+        AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_);
+        maskQueue.EnQue(maskLocal);
+        inQueue.FreeTensor(inputLocal);
+    }
+
+    __aicore__ inline void CopyOut()  // Writes both results of Compute() back to GM and returns their queue buffers.
+    {
+        AscendC::LocalTensor<scalar_t> maskedLocal = outQueue.DeQue<scalar_t>();
+        AscendC::LocalTensor<bool> maskLocal = maskQueue.DeQue<bool>();  // filled as int8_t in Compute(), dequeued as bool here
+        
+        AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_);  // size_ is the padded count: up to 31 elements beyond the requested size are written
+        AscendC::DataCopy(maskOutGlobal, maskLocal, size_);
+        
+        outQueue.FreeTensor(maskedLocal);
+        maskQueue.FreeTensor(maskLocal);
+    }
+
+private:
+    static constexpr int32_t BUFFER_NUM = 2;  // buffer count of the four result_* queues
+    AscendC::TPipe pipe;  // backs every queue and TBuf below; reset in the destructor
+    AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueue;
+    AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueue, maskQueue;
+    AscendC::GlobalTensor<scalar_t> inputGlobal, maskedOutputGlobal;
+    AscendC::GlobalTensor<bool> maskOutGlobal;
+    AscendC::TBuf<AscendC::TPosition::VECCALC> calc_buf_1;  // fp32 scratch in CompareWithValue; int16 scratch on the __CCE_AICORE__ < 220 paths
+    AscendC::TBuf<AscendC::TPosition::VECCALC> calc_buf_2;  // half scratch in CompareWithValue; int16 scratch on the __CCE_AICORE__ < 220 paths
+    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_ge_que;
+    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_le_que;
+    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_org_mask_que;
+    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_add_mask_que;
+
+    // Temporary buffers
+    AscendC::TBuf<AscendC::TPosition::VECCALC> start_buf;
+    AscendC::TBuf<AscendC::TPosition::VECCALC> end_buf; 
+    AscendC::TBuf<AscendC::TPosition::VECCALC> inputFloat_buf;
+    AscendC::TBuf<AscendC::TPosition::VECCALC> validOffset_buf;
+    AscendC::TBuf<AscendC::TPosition::VECCALC> vocabMask_buf_;
+    AscendC::TBuf<AscendC::TPosition::VECCALC> ones_buf_;
+    
+    __gm__ scalar_t *input_, *masked_input_;  // stored by Init(); not read by any method of this class
+    __gm__ bool *mask_out_;  // stored by Init(); not read by any method of this class
+    int64_t size_;  // element count rounded up to a multiple of 32 in Init()
+    int64_t org_vocab_start_index_, org_vocab_end_index_;
+    int64_t added_vocab_start_index_, added_vocab_end_index_;
+    int64_t added_offset_;
+
+    static constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38;  // 2^-126
+    static constexpr float MAX_MUL_1_FP32 = 1125899906842624;  // 2^50
+    static constexpr float MAX_MUL_2_FP32 = 67108864;  // 2^26; 2^50 * 2^50 * 2^26 = 2^126 rescales MIN_ACCURACY_FP32 to 1.0
+    static constexpr float NEGATIVE_ONE_FP32 = -1.0f;
+};
+
+// Kernel entry point for int32 token ids.
+//
+// The work is split into `loop_cnt` chunks of size / loop_cnt elements each. The block with index b processes
+// chunks b, b + aiv_num, b + 2 * aiv_num, ... by re-initialising one operator object per chunk.
+//
+// NOTE(review): the chunk offset is evaluated as (chunk * size) / loop_cnt while the chunk length is
+// size / loop_cnt; the two only line up when size is a multiple of loop_cnt - confirm the host guarantees that.
+extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel(
+    __gm__ int32_t* input,
+    __gm__ int32_t* masked_input,
+    __gm__ bool* mask_out, 
+    const int64_t org_vocab_start_index,
+    const int64_t org_vocab_end_index,
+    const int64_t num_org_vocab_padding,
+    const int64_t added_vocab_start_index,
+    const int64_t added_vocab_end_index,
+    const int64_t size,
+    const uint32_t loop_cnt,
+    const uint32_t aiv_num)
+{
+    // The operator (and its TPipe) lives for the whole kernel and is destroyed when the function returns.
+    GetMaskedInputAndMask<int32_t> op{};
+
+    for (int64_t chunk = AscendC::GetBlockIdx(); chunk < loop_cnt; chunk += aiv_num) {
+        // Element offset of this chunk inside all three GM arrays, and the element count handed to Init().
+        const int64_t gmOffset = chunk * size / loop_cnt;
+        const int64_t chunkSize = size / loop_cnt;
+
+        op.Init(input + gmOffset,
+                masked_input + gmOffset,
+                mask_out + gmOffset,
+                org_vocab_start_index, org_vocab_end_index,
+                num_org_vocab_padding, added_vocab_start_index,
+                added_vocab_end_index, chunkSize);
+        op.Process();
+    }
+}
+
+namespace vllm_ascend {
+
+// Host-side launcher: reinterprets the untyped device pointers (int32 ids in and out, bool mask out) and starts
+// get_masked_input_and_mask_kernel with `aiv_num` blocks on `stream`. All scalar arguments are forwarded as-is.
+void get_masked_input_and_mask_impl(
+    void* stream,
+    void* input,
+    void* masked_input,
+    void* mask_out,
+    const int64_t org_vocab_start_index,
+    const int64_t org_vocab_end_index,
+    const int64_t num_org_vocab_padding, 
+    const int64_t added_vocab_start_index,
+    const int64_t added_vocab_end_index,
+    const int64_t size,
+    const uint32_t loop_cnt,
+    const uint32_t aiv_num)
+{
+    int32_t* const inputIds = static_cast<int32_t*>(input);
+    int32_t* const maskedIds = static_cast<int32_t*>(masked_input);
+    bool* const maskFlags = static_cast<bool*>(mask_out);
+
+    get_masked_input_and_mask_kernel<<<aiv_num, nullptr, stream>>>(
+        inputIds, maskedIds, maskFlags,
+        org_vocab_start_index, org_vocab_end_index, num_org_vocab_padding,
+        added_vocab_start_index, added_vocab_end_index,
+        size, loop_cnt, aiv_num);
+}
+
+} // namespace vllm_ascend
--- a/csrc/kernels/pos_encoding_kernels.cpp
+++ b/csrc/kernels/pos_encoding_kernels.cpp
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_operator.h"
+#include <stdio.h>
+#include "types.h"
+#include "utils.h"
+
+
+using vllm_ascend::AccType;
+using vllm_ascend::local_mem_copy;
+template <typename scalar_t, bool isNeox> class RotaryEmbedding {
+    // NOTE(ganyi): we use 512B as load stride for pipe, need to find another way to
+    // retrieve this size from runtime for more Soc support
+    #if (__CCE_AICORE__ >= 220)
+        static int constexpr loadSize = 512;
+    #else
+        static int constexpr loadSize = 1024 * 4;
+    #endif
+    using dst_t = scalar_t;
+    using acc_t = typename AccType<scalar_t>::type;
+    // only half tensor have cast instruct to int8, hardcode acc_dst_t as half
+    using local_scalar_t = AscendC::LocalTensor<scalar_t>;
+    using local_acc_t = AscendC::LocalTensor<acc_t>;
+    using local_dst_t = AscendC::LocalTensor<dst_t>;
+
+public:
+    __aicore__ inline RotaryEmbedding()  // Empty; all setup happens in init().
+    {
+    }
+
+    // Allocate buffers for input and output queue and the temp buffer used during kernel compute process,
+    // this init process happens only in the kernel compute on a single vector core.
+    __aicore__ inline void init(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst,
+                                __gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache,
+                                const int rotDim, const int64_t dstQueryStride,
+                                const int64_t dstKeyStride, const int64_t queryStride, const int64_t keyStride,
+                                const int numHeads, const int numKvHeads, const int headSize, AscendC::TPipe *pipe)
+    {  // The six GM pointers are accepted but not used here; update_mem_offset() binds them for each token.
+        pipe_ = pipe;
+        rotDim_ = rotDim;
+        // query stride and key stride is used to handle the strided tensor which is not contiguous on num_tokens dim
+        queryStride_ = queryStride;
+        keyStride_ = keyStride;
+        dstQueryStride_ = dstQueryStride;
+        dstKeyStride_ = dstKeyStride;
+        numHeads_ = numHeads;
+        numKvHeads_ = numKvHeads;
+        headSize_ = headSize;
+        embedDim_ = rotDim / 2;  // half of the rotary dimension; compute_function() reads cos from the first half of a cache row and sin from the second
+
+        pipe_->InitBuffer(inQue_, 1 /* buffer_num */, loadSize /* buffer_size */);
+        pipe_->InitBuffer(inQueSinCos_, 1 /* buffer_num */, rotDim_ * sizeof(scalar_t) /* buffer_size */);
+        pipe_->InitBuffer(outQue_, 1 /* buffer_num */, loadSize /* buffer_size */);
+        // 2 temporary calculation buffer
+        calcTmpBufferOffset_ = 0;
+        // 1 upcast buffer for bf16 (headSize)
+        upcastInputBufferOffset_ = calcTmpBufferOffset_ + sizeof(acc_t) * embedDim_ * 2;
+        // 1 upcast temp buffer for bf16 (2 * embed_dim)
+        upcastTempBufferOffset_ = upcastInputBufferOffset_ + sizeof(acc_t) * headSize_;
+        // 2 sin cos upcast buffer for bf16
+        cosSinUpcastBufferOffset_ = upcastTempBufferOffset_ + sizeof(acc_t) * 2 * embedDim_;
+        // 2. bf16 path: needs 2 cos sin upcast buffer size
+        // 3. fp16 path: needs 2 temporary calculation buffer size
+        tempBufferSize_ = cosSinUpcastBufferOffset_ + 2 * embedDim_ * sizeof(acc_t);  // total bytes of calcBuf_: the four regions above laid out back to back
+        // need to consider upcast the bf16 to fp32, so we might need 4 buffer just in case
+        // 2 temporary buffer, 2 input buffer, 1 cos buffer, 1 sin buffer, 2 scale buffer (headSize), 2 zp
+        // buffer(headSize int8), 1 dst_temp buffer(headSize, int32)
+        pipe_->InitBuffer(calcBuf_, tempBufferSize_ /* buffer_size */);
+        if constexpr (!std::is_same_v<scalar_t, acc_t>) {  // only when the accumulation type differs from scalar_t
+            pipe_->InitBuffer(copyBuf_, loadSize);  // staging buffer for the scalar_t -> acc_t -> dst_t pre-copy in compute_qk()
+        }
+    }
+    // Re-points the five global tensors at the data of token `idx`:
+    //   - cosSin_   : the cache row selected by positions[idx] (rotDim_ elements);
+    //   - query_/key_       : the token's heads in the source tensors, located with queryStride / keyStride;
+    //   - queryDst_/keyDst_ : the token's heads in the destinations, located with dstQueryStride / dstKeyStride.
+    // The row length comes from the rotDim_ / numHeads_ / numKvHeads_ members set by init(); the rotDim,
+    // numHeads and numKvHeads parameters are not read.
+    __aicore__ inline void update_mem_offset(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst,
+                                  __gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache,
+                                  const int rotDim, const int64_t dstQueryStride, const int64_t dstKeyStride,
+                                  const int64_t queryStride, const int64_t keyStride, const int numHeads,
+                                  const int numKvHeads, const int headSize, const int64_t idx)
+    {
+        const int64_t tokenPos = positions[idx];
+        const int queryElems = headSize * numHeads_;
+        const int keyElems = headSize * numKvHeads_;
+        __gm__ dst_t *queryOut = reinterpret_cast<__gm__ dst_t *>(queryDst) + dstQueryStride * idx;
+        __gm__ dst_t *keyOut = reinterpret_cast<__gm__ dst_t *>(keyDst) + dstKeyStride * idx;
+
+        cosSin_.SetGlobalBuffer(cosSinCache + tokenPos * rotDim_, rotDim_);
+        query_.SetGlobalBuffer(query + queryStride * idx, queryElems);
+        key_.SetGlobalBuffer(key + keyStride * idx, keyElems);
+        queryDst_.SetGlobalBuffer(queryOut, queryElems);
+        keyDst_.SetGlobalBuffer(keyOut, keyElems);
+    }
+
+    // compute per head for neox on bf16
+    // Overload selected when the accumulation type differs from scalar_t.
+    //
+    // Rotates one head: with x = src[0, embedDim_) and y = src[embedDim_, 2 * embedDim_),
+    //   dst[0, embedDim_)             = x * cos - y * sin
+    //   dst[embedDim_, 2 * embedDim_) = x * sin + y * cos
+    // The head (headSize_ elements) is first upcast into upcastInputBuffer, the products are formed in
+    // calcTmpBuffer (2 * embedDim_ elements), and each half is cast back to dst_t with CAST_TRUNC.
+    // Elements of dst beyond 2 * embedDim_ are not written here.
+    template <typename acc_t_, typename std::enable_if<!std::is_same_v<acc_t_, scalar_t>, void>::type * = nullptr>
+    __aicore__ inline void
+    neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor<acc_t_> sin, AscendC::LocalTensor<acc_t_> cos,
+                 AscendC::LocalTensor<acc_t_> upcastInputBuffer, AscendC::LocalTensor<acc_t_> calcTmpBuffer)
+    {
+        // slice dst
+        local_dst_t dstX = dst;
+        local_dst_t dstY = dst[embedDim_];
+
+        // slice temp buffer
+        local_acc_t calcTmpBufferX = calcTmpBuffer;
+        local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_];
+
+        // slice upcast input buffer
+        local_acc_t upcastBufferX = upcastInputBuffer;
+        local_acc_t upcastBufferY = upcastBufferX[embedDim_];
+
+        // Upcast the whole head once; both halves below read from the upcast copy, not from src.
+        Cast(upcastInputBuffer, src, AscendC::RoundMode::CAST_NONE, headSize_);
+
+        // dst x calc: x * cos - y * sin
+        Mul(calcTmpBufferX, upcastBufferX, cos, embedDim_);
+        Mul(calcTmpBufferY, upcastBufferY, sin, embedDim_);
+        Sub(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_);
+        Cast(dstX, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_);
+
+        // dst y calc: x * sin + y * cos
+        Mul(calcTmpBufferX, upcastBufferX, sin, embedDim_);
+        Mul(calcTmpBufferY, upcastBufferY, cos, embedDim_);
+        Add(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_);
+        Cast(dstY, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_);
+    }
+
+    // compute per head output for neox
+    template <typename acc_t_, typename std::enable_if<std::is_same_v<acc_t_, scalar_t>, void>::type * = nullptr>
+    __aicore__ inline void
+    neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor<acc_t_> sin, AscendC::LocalTensor<acc_t_> cos,
+                 AscendC::LocalTensor<acc_t_> upcastInputBuffer, AscendC::LocalTensor<acc_t_> calcTmpBuffer)
+    {  // Overload selected when acc_t_ == scalar_t: rotates one head directly in scalar_t; upcastInputBuffer is unused.
+        // slice dst buffer: X is the first embedDim_ elements, Y starts at element embedDim_
+        local_dst_t dstX = dst;
+        local_dst_t dstY = dst[embedDim_];
+        // slice src buffer
+        local_scalar_t srcX = src;
+        local_scalar_t srcY = src[embedDim_];
+        // slice temp buffer
+        local_acc_t calcTmpBufferX = calcTmpBuffer;
+        local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_];
+
+        // dst x calc
+        Mul(calcTmpBufferX, srcX, cos, embedDim_);
+        Mul(calcTmpBufferY, srcY, sin, embedDim_);
+        Sub(dstX, calcTmpBufferX, calcTmpBufferY, embedDim_);  // x' = x * cos - y * sin
+
+        // dst y calc
+        Mul(calcTmpBufferX, srcX, sin, embedDim_);
+        Mul(calcTmpBufferY, srcY, cos, embedDim_);
+        Add(dstY, calcTmpBufferX, calcTmpBufferY, embedDim_);  // y' = x * sin + y * cos
+    }
+
+    // Helper for compute_qk(): loads `elemCnt` elements starting `gmOffset` elements into srcG, rotates the
+    // first `headCnt` heads of that chunk with neox_compute(), and stores `elemCnt` elements at the same offset
+    // of dstG.
+    //
+    // Before rotating, the output tensor is pre-filled from the input (presumably so that elements neox_compute()
+    // does not write carry over unchanged - confirm). When scalar_t != acc_t the pre-fill goes through copyBuf_
+    // and covers only elemCnt / sizeof(scalar_t) elements.
+    __aicore__ inline void rotate_head_chunk(AscendC::GlobalTensor<scalar_t> &srcG, AscendC::GlobalTensor<dst_t> &dstG,
+                                             local_acc_t &localCos, local_acc_t &localSin,
+                                             local_acc_t &upcastInputBuffer, local_acc_t &calcTmpBuffer,
+                                             int gmOffset, int elemCnt, int headCnt)
+    {
+        local_scalar_t loaded = inQue_.AllocTensor<scalar_t>();
+        local_dst_t rotated = outQue_.AllocTensor<dst_t>();
+        AscendC::DataCopy(loaded, srcG[gmOffset], elemCnt);
+        inQue_.EnQue(loaded);
+
+        local_scalar_t srcLocal = inQue_.DeQue<scalar_t>();
+        if constexpr (std::is_same_v<scalar_t, acc_t>) {
+            local_mem_copy(rotated, srcLocal, elemCnt);
+        } else {
+            int castCnt = elemCnt / sizeof(scalar_t);
+            AscendC::LocalTensor<acc_t> staging = copyBuf_.GetWithOffset<acc_t>(castCnt, 0);
+            Cast(staging, srcLocal, AscendC::RoundMode::CAST_TRUNC, castCnt);
+            Cast(rotated, staging, AscendC::RoundMode::CAST_TRUNC, castCnt);
+        }
+
+        for (int head = 0; head < headCnt; ++head) {
+            neox_compute(srcLocal[head * headSize_], rotated[head * headSize_], localSin, localCos,
+                         upcastInputBuffer, calcTmpBuffer);
+        }
+
+        outQue_.EnQue(rotated);
+        local_dst_t stored = outQue_.DeQue<dst_t>();
+        AscendC::DataCopy(dstG[gmOffset], stored, elemCnt);
+        outQue_.FreeTensor(stored);
+        inQue_.FreeTensor(srcLocal);
+    }
+
+    // Rotates every head of one token's query or key tensor.
+    //
+    // The heads are processed in `loopCnt` full chunks of `headNumPerLoad` heads (`loadStride` elements each),
+    // followed by one tail chunk of `tailHeads` heads. The tail chunk is always issued, also when tailHeads is 0.
+    __aicore__ inline void compute_qk(AscendC::GlobalTensor<scalar_t> srcG, AscendC::GlobalTensor<dst_t> dstG,
+                                          local_acc_t localCos, local_acc_t localSin, local_acc_t upcastInputBuffer,
+                                          local_acc_t calcTmpBuffer, int loopCnt, int tailHeads, int loadStride,
+                                          int headNumPerLoad)
+    {
+        for (int chunk = 0; chunk < loopCnt; ++chunk) {
+            rotate_head_chunk(srcG, dstG, localCos, localSin, upcastInputBuffer, calcTmpBuffer,
+                              chunk * loadStride, loadStride, headNumPerLoad);
+        }
+        // process tail
+        rotate_head_chunk(srcG, dstG, localCos, localSin, upcastInputBuffer, calcTmpBuffer,
+                          loopCnt * loadStride, tailHeads * headSize_, tailHeads);
+    }
+
+    __aicore__ inline void compute_function()  // Rotates all query and key heads of the token bound by update_mem_offset().
+    {
+        local_scalar_t cosSinLocal = inQueSinCos_.AllocTensor<scalar_t>();
+
+        AscendC::DataCopy(cosSinLocal, cosSin_, embedDim_ * 2);  // one cache row: embedDim_ cos values followed by embedDim_ sin values
+
+        inQueSinCos_.EnQue(cosSinLocal);
+        local_scalar_t localSinCosDeque = inQueSinCos_.DeQue<scalar_t>();
+        local_scalar_t localCos = localSinCosDeque;
+        local_scalar_t localSin = localSinCosDeque[embedDim_];
+
+        local_acc_t calcTmpBuffer;
+        local_acc_t upcastInputBuffer;
+        local_acc_t upcastTempBuffer;
+        local_acc_t cosSinUpcastBuffer;
+        local_acc_t scaleBuffer;  // never used in this function
+        local_acc_t offsetBuffer;  // never used in this function
+        calcTmpBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, calcTmpBufferOffset_);
+        upcastInputBuffer = calcBuf_.GetWithOffset<acc_t>(headSize_, upcastInputBufferOffset_);
+        upcastTempBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, upcastTempBufferOffset_);  // fetched but not used below
+        cosSinUpcastBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, cosSinUpcastBufferOffset_);
+
+        local_acc_t cosAccBuffer;
+        local_acc_t sinAccBuffer;
+
+        if constexpr (!std::is_same_v<scalar_t, acc_t>) {  // upcast cos/sin once per token when acc_t differs from scalar_t
+            Cast(cosSinUpcastBuffer, localSinCosDeque, AscendC::RoundMode::CAST_NONE, 2 * embedDim_);
+            cosAccBuffer = cosSinUpcastBuffer;
+            sinAccBuffer = cosSinUpcastBuffer[embedDim_];
+        } else {
+            cosAccBuffer = localCos;
+            sinAccBuffer = localSin;
+        }
+
+        constexpr const int loadSizeByElem = loadSize / sizeof(scalar_t);
+        int64_t headNumPerLoad = loadSizeByElem / headSize_;  // whole heads per loadSize-byte transfer; NOTE(review): 0 when headSize_ > loadSizeByElem, which makes the next line divide by zero
+        int64_t loopCnt = numHeads_ / headNumPerLoad;
+        int64_t tailHeads = numHeads_ - loopCnt * headNumPerLoad;
+        int64_t loadStride = headNumPerLoad * headSize_;
+        int64_t loopCntKv = numKvHeads_ / headNumPerLoad;
+        int64_t tailHeadsKv = numKvHeads_ - loopCntKv * headNumPerLoad;
+        compute_qk(query_, queryDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer,
+                       calcTmpBuffer, loopCnt, tailHeads, loadStride, headNumPerLoad);  // the int64_t counts are narrowed to compute_qk's int parameters
+
+        compute_qk(key_, keyDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer, calcTmpBuffer,
+                       loopCntKv, tailHeadsKv, loadStride, headNumPerLoad);
+
+        inQueSinCos_.FreeTensor(localSinCosDeque);  // the cos/sin row is held until both tensors are done
+    }
+
+private:
+    AscendC::TPipe *pipe_;  // supplied by the caller through init(); not created here
+    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQue_, inQueSinCos_;
+    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQue_;
+    AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf_;  // tempBufferSize_ bytes, sliced with the *Offset_ members below
+    AscendC::TBuf<AscendC::TPosition::VECCALC> copyBuf_;  // only initialised when scalar_t != acc_t
+    AscendC::GlobalTensor<dst_t> queryDst_;
+    AscendC::GlobalTensor<dst_t> keyDst_;
+    AscendC::GlobalTensor<scalar_t> query_;
+    AscendC::GlobalTensor<scalar_t> key_;
+    AscendC::GlobalTensor<scalar_t> cosSin_;
+    int rotDim_;
+    int embedDim_;  // rotDim / 2
+    int64_t queryStride_;  // stored by init(); update_mem_offset() uses its own stride parameters instead
+    int64_t keyStride_;
+    int64_t dstQueryStride_;
+    int64_t dstKeyStride_;
+    int numHeads_;
+    int numKvHeads_;
+    int headSize_;
+    int calcTmpBufferOffset_;  // byte offsets into calcBuf_, computed in init()
+    int upcastInputBufferOffset_;
+    int upcastTempBufferOffset_;
+    int cosSinUpcastBufferOffset_;
+    int tempBufferSize_;
+};
+
+// Note: a macro is used to instantiate all the target functions here, because the current build system does not support template calls in cpp.
+// C-style (extern "C") symbols are used for the kernel entries; C++-style kernel entries may lead to compilation failure. The loopNum parameter is not referenced by the body.
+#define ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, NEOX)                                                                            \
+    extern "C" __global__ __aicore__ void rope_custom_##NEOX##_##TYPE(                                                          \
+        __gm__ int64_t* positions, __gm__ void* queryDst, __gm__ void* keyDst, __gm__ TYPE* query, __gm__ TYPE* key,            \
+        __gm__ TYPE* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride,                         \
+        const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads,                     \
+        const int headSize, const int64_t numTokens, const int loopNum, const int coreNum)                                      \
+    {                                                                                                                           \
+        AscendC::TPipe pipe;                                                                                                    \
+        RotaryEmbedding<TYPE, NEOX> op{};                                                                                       \
+        op.init(positions, queryDst, keyDst, query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride,                     \
+                queryStride, keyStride, numHeads, numKvHeads, headSize, &pipe);                                                 \
+        for (int64_t i = AscendC::GetBlockIdx(); i < numTokens; i += coreNum) {                                                 \
+            op.update_mem_offset(positions, queryDst, keyDst, query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride,    \
+                      queryStride, keyStride, numHeads, numKvHeads, headSize, i);                                               \
+            op.compute_function();                                                                                              \
+        }                                                                                                                       \
+    }
+
+// Emits the kernel entries for one element type with NEOX=true and with NEOX=false,
+// i.e. rope_custom_true_<TYPE> and rope_custom_false_<TYPE>.
+#define ROPE_CUSTOM_KERNEL_DECLARE(TYPE)    \
+    ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, true); \
+    ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, false);
+
+// Declare all the kernel entries here.
+// The bfloat16_t entries are only emitted when __CCE_AICORE__ >= 220.
+ROPE_CUSTOM_KERNEL_DECLARE(half)
+#if (__CCE_AICORE__ >= 220)
+    ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t)
+#endif
+
+namespace vllm_ascend {
+
+// Launches rope_custom_true_<TYPE> when `isNeox` is set and rope_custom_false_<TYPE> otherwise.
+// The expansion reads these names from the enclosing scope: isNeox, blockDim, stream, positions, queryDst, keyDst,
+// query, key, cosSinCache, rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, numHeads, numKvHeads,
+// headSize, numTokens and loopCnt. `loopCnt` is passed as the kernel's loopNum argument and `blockDim` as coreNum.
+// The macro expands to an unbraced if/else that already ends with ';', so it should only be used as a
+// stand-alone statement inside a braced block.
+#define ROTARY_EMBEDDING_KERNEL_CALL(TYPE)                                                                       \
+    if (isNeox)                                                                                                  \
+        rope_custom_true_##TYPE<<<blockDim, nullptr, stream>>>(                                                  \
+            positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key),         \
+            reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \
+            numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim);                                       \
+    else                                                                                                         \
+        rope_custom_false_##TYPE<<<blockDim, nullptr, stream>>>(                                                 \
+            positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key),         \
+            reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \
+            numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim);
+
+// Maximum number of blocks the runtime accepts when launching an AscendC kernel.
+// We use it to cap the block count chosen below.
+static const int64_t maxParallelSize = 65535;
+
+// Host-side launcher for the rotary embedding kernels.
+// Picks the kernel entry from `type` (FP16 -> half, BF16 -> bfloat16_t when __CCE_AICORE__ >= 220) and from
+// `isNeox`, and launches it on `stream` with min(numTokens, maxParallelSize) blocks. The kernel itself strides
+// over the tokens by the block count, so every token is still visited when numTokens exceeds the cap.
+// `loopCnt` is forwarded to the kernel unchanged; `aivNum` is not used by this function.
+// Any other `type` value, or a non-positive numTokens, makes the call a no-op.
+extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst,
+                                    void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim,
+                                    const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride,
+                                    const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
+                                    const int headSize, const int64_t numTokens, const uint32_t loopCnt,
+                                    uint32_t aivNum)
+{
+    // There is nothing to rotate, and launching would otherwise request a zero (or negative) block count.
+    if (numTokens <= 0) {
+        return;
+    }
+
+    // The value is bounded by maxParallelSize here, so the narrowing to int cannot truncate.
+    int blockDim = static_cast<int>(maxParallelSize > numTokens ? numTokens : maxParallelSize);
+    if (type == AscendType::FP16) {
+        ROTARY_EMBEDDING_KERNEL_CALL(half);
+    }
+    #if (__CCE_AICORE__ >= 220)
+    else if (type == AscendType::BF16) {
+        ROTARY_EMBEDDING_KERNEL_CALL(bfloat16_t);
+    }
+    #endif
+    else {
+        return;
+    }
+}
+
+} // namespace vllm_ascend
--- a/csrc/kernels/sgmv_expand.cpp
+++ b/csrc/kernels/sgmv_expand.cpp
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_operator.h"
+#include "types.h"
+
+// Device-side operator for the SGMV "expand" step.
+// For every token assigned to this core it looks up the token's LoRA index, multiplies the token's float input
+// vector (maxLoRARank elements) with the selected LoRA weight matrix, adds the result to the matching slice of
+// yIn and stores it to the same slice of yOut. Weights and y use `scalar_t`; the arithmetic is done in float.
+template <typename scalar_t>
+class SGMVExpand {
+public:
+    // Element types of the input vector, the weights and the output.
+    using X_T = float;
+    using W_T = scalar_t;
+    using Y_T = scalar_t;
+
+    // LoRA ranks for which Compute() has a reduction branch.
+    static constexpr uint64_t LORA_RANK_8 = 8;
+    static constexpr uint64_t LORA_RANK_16 = 16;
+    static constexpr uint64_t LORA_RANK_32 = 32;
+    static constexpr uint64_t LORA_RANK_64 = 64;
+    static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64};
+    // Number of buffers given to the W and Y queues in Init().
+    static constexpr int32_t BUFFER_NUM = 2;
+
+    // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time.
+    static constexpr int32_t NUM_BYTES_PER_REPEAT = 256;
+    static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8;
+    // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type).
+    static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float);
+    // Mask is used to control the elements that participate in computation in each iteration.
+    static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float);
+    // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants.
+    static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192;
+    static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096;
+    // Number of repeats needed to cover one full W tile.
+    static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT;
+    // BlockReduceSum would generate (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT) floats,
+    // so we need to read them all and apply PairReduceSum.
+    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 = 
+        (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
+    // The second PairReduceSum for rank=32, needs half of the repetition that happened for rank=16.
+    // Same for rank=64, we do not support ranks greater than 64.
+    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2;
+
+public:
+    // Stores the pipe used later by Init() to allocate all queues and buffers.
+    __aicore__ inline SGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {}
+
+    // Records the problem sizes, binds the global-memory tensors and allocates the local queues/buffers.
+    //   x                 - input vectors, read as float, maxLoRARank elements per token
+    //   weight            - LoRA weights, maxLoRARank * outputHiddenDim elements per LoRA
+    //   loraIndices/-Size - per-segment LoRA index (int64) and the number of entries
+    //   seqLen/-Size      - per-segment token count (int64) and the number of entries
+    //   yIn / yOut        - tensors the result is added to / written to
+    //   batchSize         - total number of tokens
+    //   numTokensPerCore  - number of consecutive tokens handled by one block
+    //   outputHiddenDim   - number of output elements produced per token
+    //   sliceOffset       - element offset of the slice inside one row of y
+    //   outputFullDim     - row length of y in elements
+    __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* loraIndices, uint32_t loraIndicesSize,
+                                __gm__ void* seqLen, uint32_t seqLenSize, __gm__ void* yIn, __gm__ void* yOut,
+                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
+                                uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
+    {
+        batchSize_ = batchSize;
+        numTokensPerCore_ = numTokensPerCore;
+        maxLoRARank_ = maxLoRARank;
+        outputHiddenDim_ = outputHiddenDim;
+        sliceOffset_ = sliceOffset;
+        outputFullDim_ = outputFullDim;
+        singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_;
+
+        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
+        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
+        yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn);
+        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut);
+        loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
+        seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize);
+
+        // Transfer queues: one repeat worth of X, and one tile of W / Y per buffer.
+        pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T));
+        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T));
+        pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
+        pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
+
+        // Float scratch buffers used for the intermediate computation.
+        pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float));
+        pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float));
+        pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
+        pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
+
+        // Each compute iteration would generate not one, but several output elements.
+        // Therefore, the following variable would determine how many output elements are calculated in each iteration.
+        // NOTE(review): maxLoRARank_ is not validated here; a rank of 0, or a rank above NUM_ELEMENTS_PER_REPEAT
+        // (which makes numOutputElementsPerInputTile_ zero), leads to a division by zero in these two statements.
+        numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_);
+        numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_;
+
+    }
+
+    // Processes the tokens [blockIdx * numTokensPerCore, +numTokensPerCore), clamped to batchSize.
+    // Tokens whose LoRA index is negative are skipped; nothing is written to yOut for them.
+    __aicore__ inline void Process()
+    {
+        int64_t blockIdx = AscendC::GetBlockIdx();
+        int64_t startIdx = blockIdx * numTokensPerCore_;
+        int64_t endIdx = startIdx + numTokensPerCore_;
+        if (endIdx > batchSize_) {
+            endIdx = batchSize_;
+        }
+        for (int64_t idx = startIdx; idx < endIdx; idx++) {
+            // Start of this token's output slice inside y.
+            yOffset_ = outputFullDim_ * idx + sliceOffset_;
+
+            // Set up LoRA index
+            CopyInIndex(idx);
+            if (reqLoRAIndex_ < 0) {
+                continue;
+            }
+            reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
+
+            CopyInX(idx);
+            // Full output tiles: load yIn, stream in the W tiles that feed this output tile, then add and store.
+            int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
+            for (int32_t i = 0; i < numStreamOut; i++) {
+                CopyInY(i);
+                for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) {
+                    CopyInW(i * numStreamInPerOutputTile_ + j);
+                    Compute(j * numOutputElementsPerInputTile_);
+                }
+                ScaleOutput();
+                CopyOut(i);
+            }
+            // Remaining partial output tile, if outputHiddenDim_ is not a multiple of the tile size.
+            ComputeLastIteration();
+        }
+    }
+
+private:
+    // Determines the LoRA index of token `idx` and stores it in reqLoRAIndex_.
+    // Walks the seqLen entries, subtracting each segment length until the token falls inside a segment, and reads
+    // loraIndices at that segment. If `idx` lies beyond the sum of all segment lengths the index is set to -1.
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // Look up the LoRA index
+        int64_t weightIdx = idx;
+        uint64_t i = 0;
+        for (; i < seqLenGm_.GetSize(); i++) {
+            int64_t repeatValue = seqLenGm_.GetValue(i);
+            if (weightIdx >= repeatValue) {
+                weightIdx -= repeatValue;
+                continue;
+            }
+            break;
+        }
+        reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
+    }
+
+    // Handles the trailing outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS output elements of the current token.
+    // Complete W tiles are processed with the default repeat counts; a final partial W tile gets repeat counts
+    // derived from its size.
+    __aicore__ inline void ComputeLastIteration()
+    {
+        int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
+        if (remainingY == 0) {
+            return;
+        }
+        int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
+        int32_t remainingW = remainingY * maxLoRARank_;
+        int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS;
+        int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS;
+
+        CopyInY(numStreamOut, remainingY);
+
+        int32_t outputIdx = 0;
+        for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) {
+            CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx);
+            Compute(outputIdx * numOutputElementsPerInputTile_);
+        }
+
+        if (remainingWForLastRepeat != 0) {
+            CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration,
+                    remainingWForLastRepeat);
+            // NOTE(review): this is a truncating division, so W elements that do not fill a whole repeat are not
+            // processed; presumably remainingWForLastRepeat is always a multiple of NUM_ELEMENTS_PER_REPEAT - confirm.
+            int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT;
+            int32_t pairReduceRepeat16 = 
+                (lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
+            int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2;
+            int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_;
+            Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32);
+        }
+
+        ScaleOutput(remainingY);
+        CopyOut(numStreamOut, remainingY);
+    }
+
+    // Loads the maxLoRARank_ input elements of token `idx` and fills dupBufferX_ with repeated copies of them
+    // until NUM_ELEMENTS_PER_REPEAT floats are covered. X_T is float in this class, so the first branch of each
+    // `if constexpr` is the one that is compiled in.
+    __aicore__ inline void CopyInX(const int64_t idx)
+    {
+        AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
+        if constexpr (std::is_same_v<X_T, float>) {
+            DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_);
+        } else {
+            uint16_t blockLen = static_cast<uint16_t>(maxLoRARank_ * sizeof(X_T));
+            DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {});
+        }
+        inQueueX_.EnQue(xLocal);
+        xLocal = inQueueX_.DeQue<X_T>();
+        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
+
+        // As we are generating multiple output elements with one API invocation,
+        // we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT
+        if constexpr (std::is_same_v<X_T, float>) {
+            for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
+                for (int32_t j = 0; j < maxLoRARank_; j++) {
+                    float entry = xLocal.GetValue(j);
+                    xDup.SetValue(i + j, entry);
+                }
+            }
+        } else {
+            Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
+            pipe_barrier(PIPE_V);
+
+            for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
+                for (int32_t j = 0; j < maxLoRARank_; j++) {
+                    float entry = xDup.GetValue(j);
+                    xDup.SetValue(i + j, entry);
+                }
+            }
+        }
+        inQueueX_.FreeTensor(xLocal);
+    }
+
+    // Enqueues `numElements` elements of yIn for output tile number `progress` of the current token.
+    __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
+    {
+        AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.AllocTensor<Y_T>();
+        DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements);
+        inQueueY_.EnQue(yInLocal);
+    }
+
+    // Enqueues `numElements` weight elements of W tile number `progress` of the selected LoRA.
+    __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS)
+    {
+        AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
+        DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements);
+        inQueueW_.EnQue(wLocal);
+    }
+
+    // Finishes one output tile: casts the queued yIn tile to float, adds it to the accumulated results in
+    // tmpBufferY_, casts the sum back to Y_T (CAST_RINT) and enqueues it for CopyOut().
+    // Despite the name, no scaling factor is applied here.
+    __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
+    {
+        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
+        AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
+        AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
+        Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
+        pipe_barrier(PIPE_V);
+        inQueueY_.FreeTensor(yInLocal);
+
+        Add(yLocal, yLocal, yInLocalFP32, numElements);
+        pipe_barrier(PIPE_V);
+
+        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
+        Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
+        pipe_barrier(PIPE_V);
+
+        outQueueY_.EnQue<Y_T>(yOutLocal);
+    }
+
+    // Consumes one queued W tile: casts it to float, multiplies it element-wise with the duplicated X vector and
+    // reduces the products, writing the results into tmpBufferY_ starting at element `progress`.
+    // The reduction sequence depends on the rank:
+    //   rank 8  - one BlockReduceSum
+    //   rank 16 - BlockReduceSum followed by one PairReduceSum
+    //   rank 32 - BlockReduceSum followed by two PairReduceSum
+    //   rank 64 - two BlockReduceSum (the second one uses pairReduceRepeat16 as its repeat count)
+    // For any other rank none of the branches runs and nothing is written to tmpBufferY_.
+    // The repeat-count parameters default to the values for a full W tile.
+    __aicore__ inline void Compute(int32_t progress,
+                                   int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS,
+                                   int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16,
+                                   int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32)
+    {
+        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
+        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
+        AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
+        AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
+
+        Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
+        pipe_barrier(PIPE_V);
+        inQueueW_.FreeTensor(wLocal);
+
+        Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
+        pipe_barrier(PIPE_V);
+
+        if (maxLoRARank_ == LORA_RANK_8) {
+            BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        } else if (maxLoRARank_ == LORA_RANK_16) {
+            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
+                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        } else if (maxLoRARank_ == LORA_RANK_32) {
+            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
+                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        } else if (maxLoRARank_ == LORA_RANK_64) {
+            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
+                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+            BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
+                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
+            pipe_barrier(PIPE_V);
+        }
+    }
+
+    // Stores the `numElements` results queued by ScaleOutput() to output tile number `progress` of yOut.
+    __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
+    {
+        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
+        DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements);
+        outQueueY_.FreeTensor(yOutLocal);
+    }
+
+private:
+    // Pipe that owns the queue/buffer memory; provided by the kernel entry.
+    AscendC::TPipe* pipe_;
+    // Queues for the yIn and W tiles, the X vector, and the finished y tiles.
+    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueY_, inQueueW_;
+    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX_;
+    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueY_;
+    // Float scratch: W tile as float, duplicated X, yIn as float, accumulated results.
+    AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_;
+    // Global-memory views bound in Init().
+    AscendC::GlobalTensor<X_T> xGm_;
+    AscendC::GlobalTensor<W_T> wGm_;
+    AscendC::GlobalTensor<Y_T> yInGm_;
+    AscendC::GlobalTensor<Y_T> yOutGm_;
+    AscendC::GlobalTensor<int64_t> loraIndicesGm_;
+    AscendC::GlobalTensor<int64_t> seqLenGm_;
+    // Problem sizes copied from the Init() arguments.
+    uint32_t batchSize_;
+    uint32_t numTokensPerCore_;
+    uint32_t maxLoRARank_;
+    uint32_t outputHiddenDim_;
+    uint32_t sliceOffset_;
+    uint32_t outputFullDim_;
+    // Number of weight elements belonging to one LoRA (maxLoRARank_ * outputHiddenDim_).
+    uint32_t singleLoRAWeightLen_;
+    // LoRA index of the token currently being processed (-1 if none) and the start of its weights.
+    int64_t reqLoRAIndex_;
+    uint64_t reqLoRAWeightOffset_;
+    // Tiling quantities computed at the end of Init().
+    uint32_t numOutputElementsPerInputTile_;
+    uint32_t numStreamInPerOutputTile_;
+    // Element offset of the current token's slice inside y.
+    uint64_t yOffset_;
+
+    // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously.
+    // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat,
+    // reads next 8 consecutive blocks in the second repeat.
+    AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4};
+
+    // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block,
+    // so we set dstRepStride = 1
+    AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8};
+
+    // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks.
+    // For xDup we repeatedly use it, so we set src0RepStride = 0
+    AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8};
+
+};
+
+// Defines the extern "C" kernel entry sgmv_expand_<TYPE>.
+// The generated kernel creates a TPipe, constructs SGMVExpand<TYPE> on it, forwards every argument to Init()
+// unchanged and then runs Process().
+#define SGMV_EXPAND_TYPE_DECLARE(TYPE)                                                                                 \
+    extern "C" __global__ __aicore__ void sgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight,                      \
+                                                             __gm__ void* loraIndices, uint32_t loraIndicesSize,       \
+                                                             __gm__ void* seqLen, uint32_t seqLenSize,                 \
+                                                             __gm__ void* yIn,  __gm__ void* yOut,                     \
+                                                             uint32_t batchSize, uint32_t numTokensPerCore,            \
+                                                             uint32_t maxLoRARank, uint32_t outputHiddenDim,           \
+                                                             uint32_t sliceOffset, uint32_t outputFullDim)             \
+    {                                                                                                                  \
+        AscendC::TPipe pipe;                                                                                           \
+        SGMVExpand<TYPE> op(&pipe);                                                                                    \
+        op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,                                           \
+                yIn, yOut, batchSize, numTokensPerCore, maxLoRARank,                                                   \
+                outputHiddenDim, sliceOffset, outputFullDim);                                                          \
+        op.Process();                                                                                                  \
+    }
+
+// Declare the kernel entry for every supported dtype.
+// The bfloat16_t entry is only emitted when __CCE_AICORE__ >= 220.
+SGMV_EXPAND_TYPE_DECLARE(half)
+#if (__CCE_AICORE__ >= 220)
+    SGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
+#endif
+
+namespace vllm_ascend {
+// Host-side launcher for the SGMV expand kernels.
+// Launches sgmv_expand_half (FP16) or sgmv_expand_bfloat16_t (BF16, only when __CCE_AICORE__ >= 220) on `stream`
+// with ceil(batchSize / numTokensPerCore) blocks, so that each block handles numTokensPerCore consecutive tokens.
+// All remaining arguments are forwarded to the kernel unchanged.
+// The call is a no-op for an empty batch, for numTokensPerCore == 0, and for any other `type`.
+extern void sgmv_expand_impl(AscendType type, void* stream, void* x, void* weight,
+                             void* loraIndices, uint32_t loraIndicesSize,
+                             void* seqLen, uint32_t seqLenSize,
+                             void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
+                             uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
+{
+    // An empty batch would request zero blocks, and numTokensPerCore == 0 would divide by zero below.
+    if (batchSize == 0 || numTokensPerCore == 0) {
+        return;
+    }
+    // Ceiling division written so that it cannot wrap around for large uint32_t inputs.
+    const uint32_t blockDim = batchSize / numTokensPerCore + (batchSize % numTokensPerCore != 0 ? 1U : 0U);
+    if (type == AscendType::FP16) {
+        sgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,
+                                                        yIn, yOut, batchSize,
+                                                        numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset,
+                                                        outputFullDim);
+    } else if (type == AscendType::BF16) {
+        #if (__CCE_AICORE__ >= 220)
+            sgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
+                                                                  seqLen, seqLenSize, yIn, yOut, batchSize,
+                                                                  numTokensPerCore, maxLoRARank, outputHiddenDim,
+                                                                  sliceOffset, outputFullDim);
+        #endif
+    } else {
+        return;
+    }
+}
+
+} // namespace vllm_ascend
--- a/csrc/kernels/sgmv_shrink.cpp
+++ b/csrc/kernels/sgmv_shrink.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_operator.h"
+#include "types.h"
+
+template <typename scalar_t>
+class SGMVShrink {
+public:
+    using X_T = scalar_t;
+    using W_T = scalar_t;
+    using Y_T = float;
+
+    static constexpr uint64_t BUFFER_NUM = 1;
+    static constexpr uint64_t TILE_LENGTH = 11776;  // optimal performance tile length
+
+public:
+    __aicore__ inline SGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
+    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, uint32_t loraIndicesSize,
+                                __gm__ void *seqLen, uint32_t seqLenSize,
+                                __gm__ void *y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
+                                uint32_t maxLoRARank, float scale)
+    {
+        batchSize_ =  batchSize;
+        numTokensPerCore_ = numTokensPerCore;
+        inputHiddenDim_ = inputHiddenDim;
+        maxLoRARank_ = maxLoRARank;
+        scale_ = scale;
+        singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
+        incremental_ = inputHiddenDim_ > TILE_LENGTH;
+
+        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
+        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
+        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
+        loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
+        seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize);
+
+        pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
+        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
+        pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
+        pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));
+        
+        pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
+        pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
+    }
+
+    __aicore__ inline void Process()
+    {
+        int64_t blockIdx = AscendC::GetBlockIdx();
+        int64_t startIdx = blockIdx * numTokensPerCore_;
+        int64_t endIdx = startIdx + numTokensPerCore_;
+        if (endIdx > batchSize_) {
+            endIdx = batchSize_;
+        }
+        for (int64_t idx = startIdx; idx < endIdx; idx++) {
+            // set up LoRA index
+            CopyInIndex(idx);
+            if (reqLoRAIndex_ < 0) {
+                continue;
+            }
+            reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
+
+            if (incremental_) {
+                ProcessImpl<true>(idx);
+            } else {
+                ProcessImpl<false>(idx);
+            }
+
+            ScaleOutput();
+            CopyOut(idx);
+        }
+    }
+
+private:
+    template <bool INCREMENTAL_MODE>
+    __aicore__ inline void ProcessImpl(const int64_t idx)
+    {
+        AscendC::LocalTensor<float> yOutLocal = outBufferY_.Get<float>();
+        if constexpr (!INCREMENTAL_MODE) {
+            CopyInX(idx, 0, inputHiddenDim_);
+            AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
+            AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
+            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
+            pipe_barrier(PIPE_V);
+            inQueueX_.FreeTensor(xLocal);
+        }
+
+        for (int i = 0; i < maxLoRARank_; i++) {
+            float acc(0);
+            for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) {
+                if constexpr (INCREMENTAL_MODE) {
+                    CopyInX(idx, j);
+                }
+                CopyInW(i, j);
+                Compute<INCREMENTAL_MODE>(acc);
+            }
+            CopyAndComputeLastIteration<INCREMENTAL_MODE>(idx, i, acc);
+            yOutLocal.SetValue(i, acc);
+        }
+    }
+
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // look up the LoRA index
+        int64_t weightIdx = idx;
+        uint64_t i = 0;
+        for (; i < seqLenGm_.GetSize(); i++) {
+            int64_t repeatValue = seqLenGm_.GetValue(i);
+            if (weightIdx >= repeatValue) {
+                weightIdx -= repeatValue;
+                continue;
+            }
+            break;
+        }
+        reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
+    }
+
+    __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
+    {
+        AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
+        DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements);
+        inQueueX_.EnQue(xLocal);
+    }
+
+    __aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
+    {
+        AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
+        DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements);
+        inQueueW_.EnQue(wLocal);
+    }
+
+    template <bool INCREMENTAL_MODE>
+    __aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
+    {
+        AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
+        AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
+        AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
+
+        if constexpr (INCREMENTAL_MODE) {
+            AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
+            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            pipe_barrier(PIPE_V);
+            inQueueX_.FreeTensor(xLocal);
+            inQueueW_.FreeTensor(wLocal);
+        } else {
+            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            pipe_barrier(PIPE_V);
+            inQueueW_.FreeTensor(wLocal);
+        }
+        // dot product of the one tile of X and W 
+        Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
+        pipe_barrier(PIPE_V);
+        // reduce sum generate one number, which is the summation of all the dot product
+        ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
+        pipe_barrier(PIPE_V);
+
+        acc += wTmpTensor.GetValue(0);
+    }
+
+    // Handles the partial tile left over when inputHiddenDim_ is not a multiple of TILE_LENGTH.
+    // Does nothing when the hidden dimension divides evenly. Otherwise the trailing elements of W
+    // (and of X when INCREMENTAL_MODE is set) are loaded and their dot product is added to `acc`.
+    //   idx    - forwarded to CopyInX
+    //   rowIdx - forwarded to CopyInW
+    template <bool INCREMENTAL_MODE>
+    __aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
+    {
+        int32_t tailLength = inputHiddenDim_ % TILE_LENGTH;
+        if (tailLength != 0) {
+            // Index of the first tile past the full ones, i.e. the partial tile.
+            int32_t tailTileIdx = inputHiddenDim_ / TILE_LENGTH;
+            if constexpr (INCREMENTAL_MODE) {
+                CopyInX(idx, tailTileIdx, tailLength);
+            }
+            CopyInW(rowIdx, tailTileIdx, tailLength);
+            Compute<INCREMENTAL_MODE>(acc, tailLength);
+        }
+    }
+
+    // Multiplies the first maxLoRARank_ floats held in outBufferY_ by scale_ and enqueues the
+    // product on outQueueY_ (consumed by CopyOut).
+    __aicore__ inline void ScaleOutput()
+    {
+        AscendC::LocalTensor<float> accumulated = outBufferY_.Get<float>();
+        AscendC::LocalTensor<Y_T> scaled = outQueueY_.AllocTensor<Y_T>();
+
+        Muls(scaled, accumulated, scale_, maxLoRARank_);
+        // Vector-pipe barrier before the tensor is handed to the output queue.
+        pipe_barrier(PIPE_V);
+
+        outQueueY_.EnQue<Y_T>(scaled);
+    }
+
+    // Dequeues one result tensor from outQueueY_ and copies maxLoRARank_ elements of it to
+    // yOutGm_, starting at element offset maxLoRARank_ * idx; the tensor is then released.
+    __aicore__ inline void CopyOut(const int64_t idx)
+    {
+        AscendC::LocalTensor<Y_T> result = outQueueY_.DeQue<Y_T>();
+        const auto gmOffset = maxLoRARank_ * idx;
+        DataCopy(yOutGm_[gmOffset], result, maxLoRARank_);
+        outQueueY_.FreeTensor(result);
+    }
+
+private:
+    // Pipe object; presumably the pointer handed to the constructor by the kernel entry point -- confirm.
+    AscendC::TPipe *pipe_;
+    // Input queues (BUFFER_NUM buffers each). Compute() dequeues X_T tiles from inQueueX_ and W_T tiles from inQueueW_.
+    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX_, inQueueW_;
+    // Single-buffer output queue: filled by ScaleOutput(), drained by CopyOut().
+    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueY_;
+    // Float scratch: tmpBufferX_/tmpBufferW_ hold the float copies of the X/W tiles in Compute();
+    // outBufferY_ holds the float values that ScaleOutput() scales.
+    AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferX_, tmpBufferW_, outBufferY_;
+    // Global-memory views; element types are as declared.
+    AscendC::GlobalTensor<X_T> xGm_;
+    AscendC::GlobalTensor<W_T> wGm_;
+    AscendC::GlobalTensor<int64_t> loraIndicesGm_;
+    AscendC::GlobalTensor<int64_t> seqLenGm_;
+    AscendC::GlobalTensor<Y_T> yOutGm_;
+    uint32_t batchSize_;
+    uint32_t numTokensPerCore_;
+    // Length of one weight row; used as the row stride into wGm_ and split into TILE_LENGTH tiles.
+    uint32_t inputHiddenDim_;
+    // Number of elements scaled by ScaleOutput() and written per index by CopyOut().
+    uint32_t maxLoRARank_;
+    // Scalar multiplier applied in ScaleOutput().
+    float scale_;
+    uint32_t singleLoRAWeightLen_;
+    int64_t reqLoRAIndex_;
+    // Base element offset into wGm_ used when loading weight tiles.
+    uint64_t reqLoRAWeightOffset_;
+    bool incremental_;
+};
+
+// Defines the extern "C" kernel entry point sgmv_shrink_<TYPE>. The generated function creates a
+// TPipe, constructs SGMVShrink<TYPE> on it, forwards every argument unchanged to Init() and then
+// calls Process().
+#define SGMV_SHRINK_TYPE_DECLARE(TYPE)                                                                                 \
+    extern "C" __global__ __aicore__ void sgmv_shrink_##TYPE(                                                          \
+        __gm__ void* x,                                                                                                \
+        __gm__ void* weight,                                                                                           \
+        __gm__ void* loraIndices, uint32_t loraIndicesSize,                                                            \
+        __gm__ void* seqLen, uint32_t seqLenSize,                                                                      \
+        __gm__ void* y,                                                                                                \
+        uint32_t batchSize,                                                                                            \
+        uint32_t numTokensPerCore,                                                                                     \
+        uint32_t inputHiddenDim,                                                                                       \
+        uint32_t maxLoRARank,                                                                                          \
+        float scale)                                                                                                   \
+    {                                                                                                                  \
+        AscendC::TPipe tilePipe;                                                                                       \
+        SGMVShrink<TYPE> shrinkOp(&tilePipe);                                                                          \
+        shrinkOp.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,                                     \
+                      y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale);                             \
+        shrinkOp.Process();                                                                                            \
+    }
+
+// Instantiate the kernel entry point for each supported dtype (bfloat16_t only when __CCE_AICORE__ >= 220).
+SGMV_SHRINK_TYPE_DECLARE(half)
+#if (__CCE_AICORE__ >= 220)
+    SGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
+#endif
+
+namespace vllm_ascend {
+// Host-side launcher for the SGMV shrink kernel.
+//
+// Picks the kernel instantiation matching `type` and launches it on `stream` with
+// ceil(batchSize / numTokensPerCore) blocks; all remaining arguments are forwarded unchanged.
+//
+// No kernel is launched (the call returns silently) when:
+//   - batchSize or numTokensPerCore is 0 (nothing to do / the block count would be undefined),
+//   - `type` is neither FP16 nor BF16,
+//   - `type` is BF16 but the file was compiled with __CCE_AICORE__ < 220.
+extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight,
+                             void* loraIndices, uint32_t loraIndicesSize,
+                             void* seqLen, uint32_t seqLenSize,
+                             void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
+                             uint32_t maxLoRARank, float scale)
+{
+    // An empty batch needs no launch, and a zero divisor would make the division below undefined.
+    if (batchSize == 0 || numTokensPerCore == 0) {
+        return;
+    }
+    // Ceiling division written without the "+ n - 1" term so it cannot wrap around uint32_t.
+    const uint32_t blockDim = batchSize / numTokensPerCore + ((batchSize % numTokensPerCore != 0) ? 1 : 0);
+    if (type == AscendType::FP16) {
+        sgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,
+                                                        y, batchSize,
+                                                        numTokensPerCore, inputHiddenDim, maxLoRARank,
+                                                        scale);
+    } else if (type == AscendType::BF16) {
+        // The bfloat16_t kernel only exists when __CCE_AICORE__ >= 220; otherwise this branch is empty.
+        #if (__CCE_AICORE__ >= 220)
+            sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
+                                                                  seqLen, seqLenSize,
+                                                                  y, batchSize,
+                                                                  numTokensPerCore, inputHiddenDim, maxLoRARank,
+                                                                  scale);
+        #endif
+    } else {
+        // Unsupported dtype: nothing is launched.
+        return;
+    }
+}
+
+} // namespace vllm_ascend
--- a/csrc/kernels/types.h
+++ b/csrc/kernels/types.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace vllm_ascend {
+// Element-type selector passed to the host-side kernel launchers.
+// The numeric values are spelled out explicitly; keep them stable.
+enum class AscendType {
+    FP16 = 0,  // 16-bit floating point
+    BF16 = 1,  // bfloat16
+    FP32 = 2,  // 32-bit floating point
+};
+} // namespace vllm_ascend
--- a/csrc/kernels/utils.h
+++ b/csrc/kernels/utils.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "kernel_type.h"
+namespace vllm_ascend {
+
+// AccType<T>::type names the type paired with element type T (presumably the accumulation
+// type, going by the name -- confirm at the use sites). The primary template is declared but
+// not defined, so only the element types specialised below can be used.
+template <typename scalar_t> struct AccType;
+
+template <> struct AccType<half> { using type = half; };
+template <> struct AccType<float> { using type = float; };
+template <> struct AccType<int8_t> { using type = int; };
+
+// The bfloat16_t specialisation is only compiled when __CCE_AICORE__ >= 220.
+#if (__CCE_AICORE__ >= 220)
+template <> struct AccType<bfloat16_t> { using type = float; };
+#endif
+
+// Copies `size` elements from `src` to `dst` using AscendC::Copy.
+//
+// The copy is split into full repeats of 256 bytes (loadSize elements each), issued as a single
+// Copy call with `loopCnt` repeats, followed by one Copy for the remaining `tailSize` elements.
+// Each part is skipped when it is empty, so no Copy is issued with a zero mask or a zero repeat
+// count; a non-positive `size` copies nothing.
+//
+// NOTE(review): loopCnt is passed straight through as the repeat count -- confirm the API's
+// maximum repeat count against the sizes callers actually use.
+template <typename scalar_t>
+__aicore__ inline void local_mem_copy(AscendC::LocalTensor<scalar_t> dst, AscendC::LocalTensor<scalar_t> src, int size)
+{
+    constexpr int loadSize = 256 / sizeof(scalar_t);
+    const int loopCnt = size / loadSize;
+    const int tailSize = size % loadSize;
+    if (loopCnt > 0) {
+        AscendC::Copy(dst, src, loadSize, loopCnt, {1, 1, 8, 8});
+    }
+    // Only copy a tail when one exists; the original issued this call even for tailSize == 0.
+    if (tailSize > 0) {
+        AscendC::Copy(dst[loopCnt * loadSize], src[loopCnt * loadSize], tailSize, 1, {1, 1, 8, 8});
+    }
+}
+} // namespace vllm_ascend