[Kernel] add custom op GmmSwigluQuantWeightNzTensorList (#3804)
### What this PR does / why we need it? This PR introduces support for adding custom CANN `aclnn` ops to `vllm-ascend`, allowing users to define and use their own custom operators. Key changes include: - Building and installing custom ops into the `vllm-ascend`-specified directory - Binding the `aclnn` op interface to the `torch.ops._C_ascend` module - Enabling invocation of these ops within `vllm-ascend` This PR includes a sample custom op: `aclnnGroupedMatmulSwigluQuantWeightNzTensorList`, which is adapted from the CANN operator [`aclnnGroupedMatmulSwigluQuantWeightNZ`](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/API/aolapi/context/aclnnGroupedMatmulSwigluQuantWeightNZ.md). Its input parameters `weight` and `weight_scale` now accept `list[torch.Tensor]` (i.e., `at::TensorList`). ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.11.2 --------- Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
This commit is contained in:
121
csrc/utils/inc/kernel/dropmask.h
Normal file
121
csrc/utils/inc/kernel/dropmask.h
Normal file
@@ -0,0 +1,121 @@
|
||||
/**
|
||||
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file dropmask.h
|
||||
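 * \brief Dropout-mask helpers: mask offset computation, mask copy-in, and applying the mask via AscendC::DropOut.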
|
||||
*/
|
||||
|
||||
#ifndef DROPMASK_H
|
||||
#define DROPMASK_H
|
||||
|
||||
#include "util.h"
|
||||
|
||||
using AscendC::DROPOUT_MODE_BIT_MISALIGN;
|
||||
using AscendC::DropOutShapeInfo;
|
||||
using AscendC::DropOut;
|
||||
|
||||
/**
 * Parameter bundle shared by the dropout-mask helpers in this header.
 *
 * The first group is read by ComputeDropOffset, the second group is forwarded to the
 * copy-in helpers by CopyInDropMask, and the last group is read by ComputeDropMask.
 * Every member has a default initializer so that a default-constructed instance holds
 * deterministic values instead of indeterminate ones.
 */
struct DropMaskInfo {
    // ---- for computing the dropout mask offset ----
    // The parameters are laid out as if B, N, G, S1 and S2 were all split when the offset
    // is computed; for an axis that is not split, set its parameters to a suitable 0 or
    // to the original value.
    int64_t n2G = 0;              // n2 * g
    int64_t gSize = 0;            // g
    int64_t s1Size = 0;           // s1
    int64_t s2Size = 0;           // s2
    int64_t gOutIdx = 0;          // g out index
    int64_t bSSOffset = 0;        // boidx * s1 * s2 === bSSOffset
    int64_t n2OutIdx = 0;         // n out index
    int64_t s1OutIdx = 0;         // s1 out index === s1oIdx
    int64_t s1InnerIdx = 0;       // s1 inner index (ratio) === loopIdx
    int64_t s1BaseSize = 0;       // S1 base block size
    int64_t splitS1BaseSize = 0;  // s1 split size === vec1S1BaseSize
    int64_t s2StartIdx = 0;       // s2 start index
    int64_t s2Idx = 0;            // s2 index === s2LoopCount
    int64_t s2BaseNratioSize = 0; // s2 ratio length: s2BaseSize (S2 base block size) * nRatio

    // ---- for copying in the dropout mask ----
    uint32_t s1CopySize = 0;
    uint32_t s2CopySize = 0;
    int64_t s2TotalSize = 0;

    // ---- for computing the dropout mask ----
    uint32_t firstAxis = 0;
    uint32_t lstAxis = 0;
    uint32_t maskLstAxis = 0;
    int64_t vecCoreOffset = 0;    // extra s1 row offset added inside ComputeDropOffset
    // NOTE(review): defaulted to 1.0f on the assumption that DropOut rescales by keepProb,
    // so an unset value stays neutral instead of zero or garbage -- confirm against callers.
    float keepProb = 1.0f;

    // Selects the bool-mask path (true) or the bit-mask path (false) in the helpers below.
    bool boolMode = false;
};
|
||||
|
||||
template <bool hasDrop>
|
||||
__aicore__ inline int64_t ComputeDropOffset(DropMaskInfo &dropMaskInfo)
|
||||
{
|
||||
if constexpr (hasDrop == true) {
|
||||
// boidx * n2 * g* s1 * s2
|
||||
int64_t bOffset = dropMaskInfo.bSSOffset * dropMaskInfo.n2G;
|
||||
// n2oIdx * g * s1 *s2
|
||||
int64_t n2Offset = dropMaskInfo.n2OutIdx * dropMaskInfo.gSize * dropMaskInfo.s1Size * dropMaskInfo.s2Size;
|
||||
// goIdx * s1 * s2
|
||||
int64_t gOffset = dropMaskInfo.gOutIdx * dropMaskInfo.s1Size * dropMaskInfo.s2Size;
|
||||
// s1oIdx * s1BaseSize * s2Size + s1innerindex * vec1S1BaseSize * s2Size
|
||||
int64_t s1Offset = (dropMaskInfo.s1OutIdx * dropMaskInfo.s1BaseSize + dropMaskInfo.vecCoreOffset +
|
||||
dropMaskInfo.s1InnerIdx * dropMaskInfo.splitS1BaseSize) * dropMaskInfo.s2Size;
|
||||
// s2StartIdx + s2index * s2BaseNratioSize
|
||||
int64_t s2Offset = dropMaskInfo.s2StartIdx + dropMaskInfo.s2Idx * dropMaskInfo.s2BaseNratioSize;
|
||||
return bOffset + n2Offset + gOffset + s1Offset + s2Offset;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool hasDrop>
|
||||
__aicore__ inline void CopyInDropMask(LocalTensor<uint8_t>&dstTensor, GlobalTensor<uint8_t>& srcBoolTensor,
|
||||
GlobalTensor<uint8_t>& srcByteTensor, DropMaskInfo &dropMaskInfo, int64_t alignedSize = blockBytes)
|
||||
{
|
||||
if constexpr (hasDrop == true) {
|
||||
int64_t dropMaskOffset = ComputeDropOffset<hasDrop>(dropMaskInfo);
|
||||
if (unlikely(dropMaskInfo.boolMode)) {
|
||||
BoolCopyIn(dstTensor, srcBoolTensor, dropMaskOffset,
|
||||
dropMaskInfo.s1CopySize, dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
|
||||
} else {
|
||||
Bit2Int8CopyIn(dstTensor, srcByteTensor, dropMaskOffset, 1,
|
||||
dropMaskInfo.s1CopySize, dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, bool hasDrop>
|
||||
__aicore__ inline void ComputeDropMask(LocalTensor<T>& dstTensor, LocalTensor<T>& srcTensor,
|
||||
LocalTensor<uint8_t>& dropoutBuffer, LocalTensor<uint8_t>& tmpDropBuffer, DropMaskInfo &dropMaskInfo)
|
||||
{
|
||||
if constexpr (hasDrop == true) {
|
||||
DropOutShapeInfo dropOutShapeInfo;
|
||||
dropOutShapeInfo.firstAxis = dropMaskInfo.firstAxis;
|
||||
dropOutShapeInfo.srcLastAxis = dropMaskInfo.lstAxis;
|
||||
|
||||
if (unlikely(dropMaskInfo.boolMode)) {
|
||||
dropOutShapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis, blockBytes) * blockBytes;
|
||||
DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, dropOutShapeInfo);
|
||||
} else {
|
||||
dropOutShapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis / byteBitRatio, blockBytes) * blockBytes;
|
||||
if (likely(dropMaskInfo.lstAxis / byteBitRatio % blockBytes == 0)) {
|
||||
DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, dropOutShapeInfo);
|
||||
} else {
|
||||
DropOut<T, false, DROPOUT_MODE_BIT_MISALIGN>(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer,
|
||||
dropMaskInfo.keepProb, dropOutShapeInfo);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // DROPMASK_H
|
||||
Reference in New Issue
Block a user