From 3648d18e673f15a33a82d6ea95d3a9dd891ff1f5 Mon Sep 17 00:00:00 2001 From: liuchenbing Date: Tue, 19 Aug 2025 09:09:11 +0800 Subject: [PATCH] Add Custom Kernels For LoRA Performance (#2325) ### What this PR does / why we need it? Add two custom operators (sgmv_shrink and sgmv_expand) to address the performance issues of LoRA. Meanwhile, enable the graph mode for LoRA operators to enter ACL, so as to improve the model inference performance. ### Does this PR introduce _any_ user-facing change? no user-facing change ### How was this patch tested? Based on the actual test of the QWen2.5 7B model using vllm-ascend version v0.9.2.rc1, in acl graph mode, the TTFT, TPOT and throughput have increased by about 100%. Signed-off-by: liuchn <909698896@qq.com> - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/1f83e7d849ccb03990bb896f49df20343a2828b9 --------- Signed-off-by: liuchn <909698896@qq.com> Co-authored-by: liuchn <909698896@qq.com> --- csrc/kernels/sgmv_expand.cpp | 383 ++++++++++++++++++ csrc/kernels/sgmv_shrink.cpp | 267 ++++++++++++ csrc/ops.h | 30 ++ csrc/torch_binding.cpp | 89 ++++ csrc/torch_binding_meta.cpp | 16 + vllm_ascend/lora/punica_wrapper/lora_ops.py | 46 ++- vllm_ascend/lora/punica_wrapper/punica_npu.py | 22 +- vllm_ascend/meta_registration.py | 23 ++ 8 files changed, 847 insertions(+), 29 deletions(-) create mode 100644 csrc/kernels/sgmv_expand.cpp create mode 100644 csrc/kernels/sgmv_shrink.cpp diff --git a/csrc/kernels/sgmv_expand.cpp b/csrc/kernels/sgmv_expand.cpp new file mode 100644 index 0000000..95e7bb8 --- /dev/null +++ b/csrc/kernels/sgmv_expand.cpp @@ -0,0 +1,383 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_operator.h" +#include "types.h" + +template +class SGMVExpand { +public: + using X_T = float; + using W_T = scalar_t; + using Y_T = scalar_t; + + static constexpr uint64_t LORA_RANK_8 = 8; + static constexpr uint64_t LORA_RANK_16 = 16; + static constexpr uint64_t LORA_RANK_32 = 32; + static constexpr uint64_t LORA_RANK_64 = 64; + static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64}; + static constexpr int32_t BUFFER_NUM = 2; + + // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time. + static constexpr int32_t NUM_BYTES_PER_REPEAT = 256; + static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8; + // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type). + static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Mask is used to control the elements that participate in computation in each iteration. + static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants. 
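+    // Worked example (float intermediates): NUM_ELEMENTS_PER_REPEAT = 256 / 4 = 64, so an
+    // 8192-element weight tile is consumed in 128 repeats and a 4096-element output tile in 64
+    // repeats; both tile sizes are multiples of NUM_ELEMENTS_PER_REPEAT and of every supported
+    // rank (8/16/32/64), so no repeat straddles a rank boundary.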
+ static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192; + static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096; + static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT; + // BlockReduceSum would generate(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT)floats. + // So need to read them all and apply PairReduceSum + static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 = + (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + // The second PairReduceSum for rank=32, needs half of the repetition that happened for rank=16. + // Same for rank=64, we do not support ranks greater than 64. + static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2; + +public: + __aicore__ inline SGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {} + + __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* loraIndices, + __gm__ void* seqLen, __gm__ void* yIn, __gm__ void* yOut, + uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) + { + batchSize_ = batchSize; + numTokensPerCore_ = numTokensPerCore; + maxLoRARank_ = maxLoRARank; + outputHiddenDim_ = outputHiddenDim; + sliceOffset_ = sliceOffset; + outputFullDim_ = outputFullDim; + singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_; + + xGm_.SetGlobalBuffer((__gm__ X_T *)x); + wGm_.SetGlobalBuffer((__gm__ W_T *)weight); + yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn); + yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut); + loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices); + seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen); + + pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T)); + pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T)); + pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + + pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float)); + pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + + // Each compute iteration would generate not one, but several output elements. + // Therefore, the following variable would determine how many output elements are calculated in each iteration. 
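+        // Example: with rank 8 and float intermediates, BLOCK_REDUCE_NUM_REPEATS = 128 and
+        // NUM_ELEMENTS_PER_REPEAT / maxLoRARank_ = 8, so each 8192-element weight tile yields
+        // 128 * 8 = 1024 output elements, i.e. one 4096-element output tile consumes 4 weight tiles.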
+ numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_); + numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_; + + } + + __aicore__ inline void Process() + { + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t startIdx = blockIdx * numTokensPerCore_; + int64_t endIdx = startIdx + numTokensPerCore_; + if (endIdx > batchSize_) { + endIdx = batchSize_; + } + for (int64_t idx = startIdx; idx < endIdx; idx++) { + yOffset_ = outputFullDim_ * idx + sliceOffset_; + + // Set up LoRA index + CopyInIndex(idx); + if (reqLoRAIndex_ < 0) { + continue; + } + reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_; + + CopyInX(idx); + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + for (int32_t i = 0; i < numStreamOut; i++) { + CopyInY(i); + for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) { + CopyInW(i * numStreamInPerOutputTile_ + j); + Compute(j * numOutputElementsPerInputTile_); + } + ScaleOutput(); + CopyOut(i); + } + ComputeLastIteration(); + } + } + +private: + __aicore__ inline void CopyInIndex(const int64_t idx) + { + // Look up the LoRA index + int64_t weightIdx = idx; + uint64_t i = 0; + for (; i < seqLenGm_.GetSize(); i++) { + int64_t repeatValue = seqLenGm_.GetValue(i); + if (weightIdx >= repeatValue) { + weightIdx -= repeatValue; + continue; + } + break; + } + reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1; + } + + __aicore__ inline void ComputeLastIteration() + { + int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS; + if (remainingY == 0) { + return; + } + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + int32_t remainingW = remainingY * maxLoRARank_; + int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS; + int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS; + + CopyInY(numStreamOut, remainingY); + + int32_t outputIdx = 0; + for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) { + CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx); + Compute(outputIdx * numOutputElementsPerInputTile_); + } + + if (remainingWForLastRepeat != 0) { + CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration, + remainingWForLastRepeat); + int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat16 = + (lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2; + int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_; + Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32); + } + + ScaleOutput(remainingY); + CopyOut(numStreamOut, remainingY); + } + + __aicore__ inline void CopyInX(const int64_t idx) + { + AscendC::LocalTensor xLocal = inQueueX_.AllocTensor(); + if constexpr (std::is_same_v) { + DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_); + } else { + uint16_t blockLen = static_cast(maxLoRARank_ * sizeof(X_T)); + DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {}); + } + inQueueX_.EnQue(xLocal); + xLocal = inQueueX_.DeQue(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + + // As we are generating multiple output elements with one API invocation, + // we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT + if constexpr (std::is_same_v) { + for (int32_t i = 0; i 
< NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xLocal.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } else { + Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_); + pipe_barrier(PIPE_V); + + for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xDup.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } + inQueueX_.FreeTensor(xLocal); + } + + __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yInLocal = inQueueY_.AllocTensor(); + DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements); + inQueueY_.EnQue(yInLocal); + } + + __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor wLocal = inQueueW_.AllocTensor(); + DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements); + inQueueW_.EnQue(wLocal); + } + + __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor yInLocal = inQueueY_.DeQue(); + AscendC::LocalTensor yInLocalFP32 = inBufferY_.Get(); + Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueY_.FreeTensor(yInLocal); + + Add(yLocal, yLocal, yInLocalFP32, numElements); + pipe_barrier(PIPE_V); + + AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); + Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements); + pipe_barrier(PIPE_V); + + outQueueY_.EnQue(yOutLocal); + } + + __aicore__ inline void Compute(int32_t progress, + int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS, + int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16, + int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + AscendC::LocalTensor wLocal = inQueueW_.DeQue(); + AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); + + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_); + pipe_barrier(PIPE_V); + inQueueW_.FreeTensor(wLocal); + + Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_); + pipe_barrier(PIPE_V); + + if (maxLoRARank_ == LORA_RANK_8) { + BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_16) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_32) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, 
reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_64) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } + } + + __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yOutLocal = outQueueY_.DeQue(); + DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements); + outQueueY_.FreeTensor(yOutLocal); + } + +private: + AscendC::TPipe* pipe_; + AscendC::TQue inQueueY_, inQueueW_; + AscendC::TQue inQueueX_; + AscendC::TQue outQueueY_; + AscendC::TBuf tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_; + AscendC::GlobalTensor xGm_; + AscendC::GlobalTensor wGm_; + AscendC::GlobalTensor yInGm_; + AscendC::GlobalTensor yOutGm_; + AscendC::GlobalTensor loraIndicesGm_; + AscendC::GlobalTensor seqLenGm_; + uint32_t batchSize_; + uint32_t numTokensPerCore_; + uint32_t maxLoRARank_; + uint32_t outputHiddenDim_; + uint32_t sliceOffset_; + uint32_t outputFullDim_; + uint32_t singleLoRAWeightLen_; + int64_t reqLoRAIndex_; + uint64_t reqLoRAWeightOffset_; + uint32_t numOutputElementsPerInputTile_; + uint32_t numStreamInPerOutputTile_; + uint64_t yOffset_; + + // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously. + // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat, + // reads next 8 consecutive blocks in the second repeat. + AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4}; + + // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block, + // so we set dstRepStride = 1 + AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8}; + + // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks. 
+ // For xDup we repeatedly use it, so we set src0RepStride = 0 + AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8}; + +}; + +#define SGMV_EXPAND_TYPE_DECLARE(TYPE) \ + extern "C" __global__ __aicore__ void sgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, \ + __gm__ void* loraIndices, __gm__ void* seqLen, \ + __gm__ void* yIn, __gm__ void* yOut, \ + uint32_t batchSize, uint32_t numTokensPerCore, \ + uint32_t maxLoRARank, uint32_t outputHiddenDim, \ + uint32_t sliceOffset, uint32_t outputFullDim) \ + { \ + AscendC::TPipe pipe; \ + SGMVExpand op(&pipe); \ + op.Init(x, weight, loraIndices, seqLen, yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \ + outputHiddenDim, sliceOffset, outputFullDim); \ + op.Process(); \ + } + +// declare all dtype kernel +SGMV_EXPAND_TYPE_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + SGMV_EXPAND_TYPE_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { +extern void sgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, void* loraIndices, void* seqLen, + void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) +{ + uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore; + if (type == AscendType::FP16) { + sgmv_expand_half<<>>(x, weight, loraIndices, seqLen, yIn, yOut, batchSize, + numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset, + outputFullDim); + } else if (type == AscendType::BF16) { + #if (__CCE_AICORE__ >= 220) + sgmv_expand_bfloat16_t<<>>(x, weight, loraIndices, seqLen, yIn, yOut, batchSize, + numTokensPerCore, maxLoRARank, outputHiddenDim, + sliceOffset, outputFullDim); + #endif + } else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/kernels/sgmv_shrink.cpp b/csrc/kernels/sgmv_shrink.cpp new file mode 100644 index 0000000..357fb68 --- /dev/null +++ b/csrc/kernels/sgmv_shrink.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel_operator.h" +#include "types.h" + +template +class SGMVShrink { +public: + using X_T = scalar_t; + using W_T = scalar_t; + using Y_T = float; + + static constexpr uint64_t BUFFER_NUM = 1; + static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length + +public: + __aicore__ inline SGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {} + __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, __gm__ void *seqLen, + __gm__ void *y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim, + uint32_t maxLoRARank, float scale) + { + batchSize_ = batchSize; + numTokensPerCore_ = numTokensPerCore; + inputHiddenDim_ = inputHiddenDim; + maxLoRARank_ = maxLoRARank; + scale_ = scale; + singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_; + incremental_ = inputHiddenDim_ > TILE_LENGTH; + + xGm_.SetGlobalBuffer((__gm__ X_T *)x); + yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y); + wGm_.SetGlobalBuffer((__gm__ W_T *)weight); + loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices); + seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen); + + pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T)); + pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T)); + pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float)); + pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float)); + + pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T)); + pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float)); + } + + __aicore__ inline void Process() + { + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t startIdx = blockIdx * numTokensPerCore_; + int64_t endIdx = startIdx + numTokensPerCore_; + if (endIdx > batchSize_) { + endIdx = batchSize_; + } + for (int64_t idx = startIdx; idx < endIdx; idx++) { + // set up LoRA index + CopyInIndex(idx); + if (reqLoRAIndex_ < 0) { + continue; + } + reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_; + + if (incremental_) { + ProcessImpl(idx); + } else { + ProcessImpl(idx); + } + + ScaleOutput(); + CopyOut(idx); + } + } + +private: + template + __aicore__ inline void ProcessImpl(const int64_t idx) + { + AscendC::LocalTensor yOutLocal = outBufferY_.Get(); + if constexpr (!INCREMENTAL_MODE) { + CopyInX(idx, 0, inputHiddenDim_); + AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); + AscendC::LocalTensor xLocal = inQueueX_.DeQue(); + Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_); + pipe_barrier(PIPE_V); + inQueueX_.FreeTensor(xLocal); + } + + for (int i = 0; i < maxLoRARank_; i++) { + float acc(0); + for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) { + if constexpr (INCREMENTAL_MODE) { + CopyInX(idx, j); + } + CopyInW(i, j); + Compute(acc); + } + CopyAndComputeLastIteration(idx, i, acc); + yOutLocal.SetValue(i, acc); + } + } + + __aicore__ inline void CopyInIndex(const int64_t idx) + { + // look up the LoRA index + int64_t weightIdx = idx; + uint64_t i = 0; + for (; i < seqLenGm_.GetSize(); i++) { + int64_t repeatValue = seqLenGm_.GetValue(i); + if (weightIdx >= repeatValue) { + weightIdx -= repeatValue; + continue; + } + break; + } + reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? 
loraIndicesGm_.GetValue(i) : -1; + } + + __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor xLocal = inQueueX_.AllocTensor(); + DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements); + inQueueX_.EnQue(xLocal); + } + + __aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor wLocal = inQueueW_.AllocTensor(); + DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements); + inQueueW_.EnQue(wLocal); + } + + template + __aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor wLocal = inQueueW_.DeQue(); + AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); + AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); + + if constexpr (INCREMENTAL_MODE) { + AscendC::LocalTensor xLocal = inQueueX_.DeQue(); + Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements); + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueX_.FreeTensor(xLocal); + inQueueW_.FreeTensor(wLocal); + } else { + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueW_.FreeTensor(wLocal); + } + // dot product of the one tile of X and W + Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements); + pipe_barrier(PIPE_V); + // reduce sum generate one number, which is the summation of all the dot product + ReduceSum(wTmpTensor, wTmpTensor, wTmpTensor, numElements); + pipe_barrier(PIPE_V); + + acc += wTmpTensor.GetValue(0); + } + + template + __aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc) + { + int32_t colIdx = inputHiddenDim_ / TILE_LENGTH; + int32_t remaining = inputHiddenDim_ % TILE_LENGTH; + if (remaining == 0) { + return; + } + if constexpr (INCREMENTAL_MODE) { + CopyInX(idx, colIdx, remaining); + } + CopyInW(rowIdx, colIdx, remaining); + Compute(acc, remaining); + } + + __aicore__ inline void ScaleOutput() + { + AscendC::LocalTensor yLocal = outBufferY_.Get(); + AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); + + Muls(yOutLocal, yLocal, scale_, maxLoRARank_); + pipe_barrier(PIPE_V); + + outQueueY_.EnQue(yOutLocal); + } + + __aicore__ inline void CopyOut(const int64_t idx) + { + AscendC::LocalTensor yOutLocal = outQueueY_.DeQue(); + DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_); + outQueueY_.FreeTensor(yOutLocal); + } + +private: + AscendC::TPipe *pipe_; + AscendC::TQue inQueueX_, inQueueW_; + AscendC::TQue outQueueY_; + AscendC::TBuf tmpBufferX_, tmpBufferW_, outBufferY_; + AscendC::GlobalTensor xGm_; + AscendC::GlobalTensor wGm_; + AscendC::GlobalTensor loraIndicesGm_; + AscendC::GlobalTensor seqLenGm_; + AscendC::GlobalTensor yOutGm_; + uint32_t batchSize_; + uint32_t numTokensPerCore_; + uint32_t inputHiddenDim_; + uint32_t maxLoRARank_; + float scale_; + uint32_t singleLoRAWeightLen_; + int64_t reqLoRAIndex_; + uint64_t reqLoRAWeightOffset_; + bool incremental_; +}; + +#define SGMV_SHRINK_TYPE_DECLARE(TYPE) \ + extern "C" __global__ __aicore__ void sgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, \ + __gm__ void* loraIndices, __gm__ void* seqLen, \ + __gm__ void* y, uint32_t batchSize, \ + uint32_t numTokensPerCore, uint32_t inputHiddenDim, \ + uint32_t maxLoRARank, float scale) \ + { \ + AscendC::TPipe pipe; \ + SGMVShrink op(&pipe); \ + op.Init(x, weight, loraIndices, 
seqLen,y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \ + op.Process(); \ + } + +// declare all dtype kernel +SGMV_SHRINK_TYPE_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + SGMV_SHRINK_TYPE_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { +extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* loraIndices, void* seqLen, + void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim, + uint32_t maxLoRARank, float scale) +{ + uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore; + if (type == AscendType::FP16) { + sgmv_shrink_half<<>>(x, weight, loraIndices, seqLen, y, batchSize, + numTokensPerCore, inputHiddenDim, maxLoRARank, + scale); + } else if (type == AscendType::BF16) { + #if (__CCE_AICORE__ >= 220) + sgmv_shrink_bfloat16_t<<>>(x, weight, loraIndices, seqLen, y, batchSize, + numTokensPerCore, inputHiddenDim, maxLoRARank, + scale); + #endif + } else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h index fff69bc..40a9fae 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -88,4 +88,34 @@ namespace vllm_ascend { uint32_t output_hidden_dim, uint32_t slice_offset, uint32_t output_full_dim); + + extern void sgmv_shrink_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *loraIndices, + void *seqLen, + void *y, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t input_hidden_dim, + uint32_t lora_rank, + float scale); + + extern void sgmv_expand_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *loraIndices, + void *seqLen, + void *y, + void *y_out, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t lora_rank, + uint32_t output_hidden_dim, + uint32_t slice_offset, + uint32_t output_full_dim); } diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp index 8bdc4b5..82cea77 100644 --- a/csrc/torch_binding.cpp +++ b/csrc/torch_binding.cpp @@ -294,6 +294,87 @@ at::Tensor bgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, a cmd.Run(); return y_out; } + +void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len, + at::Tensor &y, double scale) +{ + at::ScalarType scalar_type = x.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out"); + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* lora_indices_ptr = lora_indices.data_ptr(); + void* seq_len_ptr = seq_len.data_ptr(); + void* y_ptr = y.data_ptr(); + int batch_size = x.size(0); + int input_hidden_token = x.size(1); + uint32_t lora_rank = y.size(1); + float scale_f = static_cast(scale); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("sgmv_shrink"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, seq_len_ptr, y_ptr, + batch_size, input_hidden_token, lora_rank, scale_f]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + 
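+        // Query the number of AI Vector cores and spread the batch across them:
+        // each core handles ceil(batch_size / aiv_num) tokens, and blockDim in the
+        // kernel launch is derived from that per-core token count.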
TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + sgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, seq_len_ptr, y_ptr, batch_size, + num_tokens_per_core, input_hidden_token, lora_rank, scale_f); + return 0; + }); + cmd.Run(); + return; +} + +at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len, + at::Tensor &y, int64_t slice_offset, int64_t slice_size) +{ + at::ScalarType scalar_type = y.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(x.size(1) <= slice_size, "hidden in should be smaller than hidden out"); + TORCH_CHECK(slice_offset >= 0, "slice offset should be no smaller than 0"); + TORCH_CHECK((slice_size + slice_offset) <= y.size(1), + "slice_size + slice_offset should be smaller than the second dimension of y") + + at::Tensor y_out = y; + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* lora_indices_ptr = lora_indices.data_ptr(); + void* seq_len_ptr = seq_len.data_ptr(); + void* y_ptr = y.data_ptr(); + void* y_out_ptr = y_out.data_ptr(); + int batch_size = x.size(0); + int lora_rank = x.size(1); + int output_full_dim = y.size(1); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("sgmv_expand"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, seq_len_ptr, y_ptr, y_out_ptr, + batch_size, lora_rank, slice_offset, slice_size, output_full_dim]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, seq_len_ptr, y_ptr, y_out_ptr, + batch_size, num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim); + return 0; + }); + cmd.Run(); + return y_out; +} } // namespace vllm_ascend TORCH_LIBRARY_EXPAND(_C, ops) @@ -326,6 +407,14 @@ TORCH_LIBRARY_EXPAND(_C, ops) "bgmv_expand(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y," " int slice_offset, int slice_size) -> Tensor"); ops.impl("bgmv_expand", torch::kPrivateUse1, &vllm_ascend::bgmv_expand); + + ops.def("sgmv_shrink(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y, float scale) -> ()"); + ops.impl("sgmv_shrink", torch::kPrivateUse1, &vllm_ascend::sgmv_shrink); + + ops.def( + "sgmv_expand(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! 
y," + " int slice_offset, int slice_size) -> Tensor"); + ops.impl("sgmv_expand", torch::kPrivateUse1, &vllm_ascend::sgmv_expand); } REGISTER_EXTENSION(_C) diff --git a/csrc/torch_binding_meta.cpp b/csrc/torch_binding_meta.cpp index 1f9464c..d69254b 100644 --- a/csrc/torch_binding_meta.cpp +++ b/csrc/torch_binding_meta.cpp @@ -69,6 +69,18 @@ std::tuple get_masked_input_and_mask_meta( return {masked_input, mask}; } +at::Tensor bgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y, + int64_t slice_offset, int64_t slice_size) { + at::Tensor y_out = at::empty_like(y); + return y_out; +} + +at::Tensor sgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len, + at::Tensor &y, int64_t slice_offset, int64_t slice_size) { + at::Tensor y_out = at::empty_like(y); + return y_out; +} + } // namespace meta } // namespace vllm_ascend @@ -81,6 +93,10 @@ namespace { ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta); // Masked input and mask meta implementation ops.impl("get_masked_input_and_mask", &vllm_ascend::meta::get_masked_input_and_mask_meta); + // Bgmv expand + ops.impl("bgmv_expand", &vllm_ascend::meta::bgmv_expand_meta); + // Sgmv expand + ops.impl("sgmv_expand", &vllm_ascend::meta::sgmv_expand_meta); } } \ No newline at end of file diff --git a/vllm_ascend/lora/punica_wrapper/lora_ops.py b/vllm_ascend/lora/punica_wrapper/lora_ops.py index dd66937..a8ff21d 100644 --- a/vllm_ascend/lora/punica_wrapper/lora_ops.py +++ b/vllm_ascend/lora/punica_wrapper/lora_ops.py @@ -52,9 +52,14 @@ def bgmv_expand_slice(inputs: torch.Tensor, slice_offset: int, slice_size: int, add_inputs: bool = True): - return torch.ops._C.bgmv_expand(inputs, lora_b_weights, - lora_indices_tensor, output_tensor, - slice_offset, slice_size) + return torch.ops._C.bgmv_expand( + inputs, + lora_b_weights, + lora_indices_tensor, + output_tensor, + slice_offset, + slice_size + ) def sgmv_shrink( @@ -69,11 +74,8 @@ def sgmv_shrink( token_nums: int, scaling: float, ): - exploded_indices = torch.repeat_interleave(lora_indices_tensor, - seq_len_tensor) - - bgmv_shrink(inputs, lora_a_weights, output_tensor, exploded_indices, - scaling) + return torch.ops._C.sgmv_shrink(inputs, lora_a_weights, lora_indices_tensor, + seq_len_tensor, output_tensor, scaling) def sgmv_expand(inputs: torch.Tensor, @@ -86,11 +88,15 @@ def sgmv_expand(inputs: torch.Tensor, max_seq_length: int, token_nums: int, add_inputs: bool = False): - exploded_indices = torch.repeat_interleave(lora_indices_tensor, - seq_len_tensor) - - bgmv_expand(inputs, lora_b_weights, output_tensor, exploded_indices, - add_inputs) + return torch.ops._C.sgmv_expand( + inputs, + lora_b_weights, + lora_indices_tensor, + seq_len_tensor, + output_tensor, + 0, + output_tensor.size(1), + ) def sgmv_expand_slice(inputs: torch.Tensor, @@ -105,8 +111,12 @@ def sgmv_expand_slice(inputs: torch.Tensor, slice_offset: int, slice_size: int, add_inputs: bool = False): - exploded_indices = torch.repeat_interleave(lora_indices_tensor, - seq_len_tensor) - - bgmv_expand_slice(inputs, lora_b_weights, output_tensor, exploded_indices, - slice_offset, slice_size, add_inputs) + return torch.ops._C.sgmv_expand( + inputs, + lora_b_weights, + lora_indices_tensor, + seq_len_tensor, + output_tensor, + slice_offset, + slice_size + ) diff --git a/vllm_ascend/lora/punica_wrapper/punica_npu.py b/vllm_ascend/lora/punica_wrapper/punica_npu.py index 8f1eaf9..a85c837 100644 --- a/vllm_ascend/lora/punica_wrapper/punica_npu.py +++ 
b/vllm_ascend/lora/punica_wrapper/punica_npu.py @@ -22,8 +22,8 @@ from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase # inherit this class class PunicaWrapperNPU(PunicaWrapperBase): """ - PunicaWrapperNPU is designed to manage and provide metadata for the punica - kernel. The main function is to maintain the state information for + PunicaWrapperNPU is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for Multi-LoRA, and to provide the interface for the pytorch punica ops. """ @@ -130,7 +130,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): add_inputs: bool = True, ): """ - Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` computation, which is suitable for the GEMM of lora'b. """ @@ -166,11 +166,11 @@ class PunicaWrapperNPU(PunicaWrapperBase): prefill stage, and the `_shrink_prefill` function should be called. Otherwise, it is the decode stage, and the _shrink_decode function should be called. - + Semantics: for i in range(len(lora_a_stacked)): y[i] += (x @ lora_a_stacked[i]) * scale - + Args: y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors x (torch.Tensor): Input tensor @@ -195,19 +195,19 @@ class PunicaWrapperNPU(PunicaWrapperBase): **kwargs) -> None: """ Performs GEMM and bias addition for multiple slices of lora_b. - + Semantics: for i in range(len(lora_b_stacked)): slice = output_slices[i] - y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + - lora_bias_stacked[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + + lora_bias_stacked[i] offset += slice - + Args: y (torch.Tensor): Output tensor. x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight - lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight output_slices (Tuple[int, ...]): Every slice's size add_inputs (bool): Defaults to True. @@ -266,7 +266,7 @@ class PunicaWrapperNPU(PunicaWrapperBase): buffer: Optional[Tuple[torch.Tensor, ...]] = None, **kwargs) -> None: """ - Applicable to linear-related lora. + Applicable to linear-related lora. Semantics: for i in range(len(lora_a_stacked)): diff --git a/vllm_ascend/meta_registration.py b/vllm_ascend/meta_registration.py index 600b5e7..f292e61 100644 --- a/vllm_ascend/meta_registration.py +++ b/vllm_ascend/meta_registration.py @@ -80,7 +80,30 @@ def get_masked_input_and_mask_meta(input: torch.Tensor, return masked_input, mask +def bgmv_expand_meta(x: torch.Tensor, + weight: torch.Tensor, + indices: torch.Tensor, + y: torch.Tensor, + slice_offset: int, + slice_size: int): + + y_out = torch.empty_like(y) + return y_out + +def sgmv_expand_meta(x: torch.Tensor, + weight: torch.Tensor, + lora_indices: torch.Tensor, + seq_len: torch.Tensor, + y: torch.Tensor, + slice_offset: int, + slice_size: int): + + y_out = torch.empty_like(y) + return y_out + register_meta_if_necessary("_C", "rotary_embedding", rotary_embedding_meta) register_meta_if_necessary("_C", "get_masked_input_and_mask", get_masked_input_and_mask_meta) +register_meta_if_necessary("_C", "bgmv_expand", bgmv_expand_meta) +register_meta_if_necessary("_C", "sgmv_expand", sgmv_expand_meta)
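
For reference, a minimal usage sketch (not part of the patch) of the two new custom ops, with shapes and dtypes following the TORCH_CHECKs in `torch_binding.cpp`; the tensor sizes, LoRA rank, variable names, and the assumption that importing `vllm_ascend` loads the `_C` extension are illustrative, not taken from the source:

```python
import torch
import torch_npu          # Ascend backend providing the "npu" device
import vllm_ascend        # assumption: importing the package loads the _C extension that registers the ops

# Illustrative sizes: 4 tokens total, two requests, two LoRA adapters.
batch_size, hidden_in, rank, hidden_out, num_loras = 4, 4096, 16, 4096, 2

# Per-request LoRA indices and per-request token counts (seq_len sums to batch_size);
# the kernels walk seq_len to map each token to its request's adapter.
lora_indices = torch.tensor([0, 1], dtype=torch.int64, device="npu")
seq_len = torch.tensor([3, 1], dtype=torch.int64, device="npu")

# sgmv_shrink: x is fp16/bf16 [batch_size, hidden_in], weight is [num_loras, rank, hidden_in],
# output accumulates in fp32 [batch_size, rank]; writes into `shrunk` in place.
x = torch.randn(batch_size, hidden_in, dtype=torch.float16, device="npu")
lora_a = torch.randn(num_loras, rank, hidden_in, dtype=torch.float16, device="npu")
shrunk = torch.zeros(batch_size, rank, dtype=torch.float32, device="npu")
torch.ops._C.sgmv_shrink(x, lora_a, lora_indices, seq_len, shrunk, 1.0)

# sgmv_expand: x is the fp32 shrink output, weight is [num_loras, hidden_out, rank],
# y is fp16/bf16 [batch_size, hidden_out]; the slice [offset, offset + slice_size)
# of x @ lora_b is added into y and the result is returned.
lora_b = torch.randn(num_loras, hidden_out, rank, dtype=torch.float16, device="npu")
y = torch.zeros(batch_size, hidden_out, dtype=torch.float16, device="npu")
y_out = torch.ops._C.sgmv_expand(shrunk, lora_b, lora_indices, seq_len, y, 0, hidden_out)
```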