### What this PR does / why we need it? This PR introduces support for adding custom CANN `aclnn` ops to `vllm-ascend`, allowing users to define and use their own custom operators. Key changes include: - Building and installing custom ops into the `vllm-ascend`-specified directory - Binding the `aclnn` op interface to the `torch.ops._C_ascend` module - Enabling invocation of these ops within `vllm-ascend` This PR includes a sample custom op: `aclnnGroupedMatmulSwigluQuantWeightNzTensorList`, which is adapted from the CANN operator [`aclnnGroupedMatmulSwigluQuantWeightNZ`](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/API/aolapi/context/aclnnGroupedMatmulSwigluQuantWeightNZ.md). Its input parameters `weight` and `weight_scale` now accept `list[torch.Tensor]` (i.e., `at::TensorList`). ### Does this PR introduce _any_ user-facing change? No. - vLLM version: v0.11.2 --------- Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
122 lines
5.0 KiB
C++
122 lines
5.0 KiB
C++
/**
|
||
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
|
||
* This file is a part of the CANN Open Software.
|
||
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
|
||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||
* See LICENSE in the root of the software repository for the full text of the License.
|
||
*/
|
||
|
||
/*!
|
||
* \file dropmask.h
|
||
* \brief Dropout-mask helpers: compute the mask offset, copy the mask in, and apply dropout to a tile.
|
||
*/
|
||
|
||
#ifndef DROPMASK_H
|
||
#define DROPMASK_H
|
||
|
||
#include "util.h"
|
||
|
||
using AscendC::DROPOUT_MODE_BIT_MISALIGN;
|
||
using AscendC::DropOutShapeInfo;
|
||
using AscendC::DropOut;
|
||
|
||
/*!
 * \brief Parameter bundle consumed by ComputeDropOffset, CopyInDropMask and ComputeDropMask.
 *
 * Every field carries a default member initializer so that a default-constructed DropMaskInfo holds
 * deterministic values instead of indeterminate ones. The defaults are placeholders only: callers are
 * still expected to assign every field that the helper they call reads.
 */
struct DropMaskInfo {
    // ---- inputs used to compute the dropout mask offset ----
    // The fields are filled in as if B, N, G, S1 and S2 were all split. For an axis that is not split,
    // set the matching field to 0 or to the original value, whichever is appropriate.
    int64_t n2G = 0;              // n2 * g
    int64_t gSize = 0;            // g
    int64_t s1Size = 0;           // s1
    int64_t s2Size = 0;           // s2
    int64_t gOutIdx = 0;          // g outer index
    int64_t bSSOffset = 0;        // boIdx * s1 * s2
    int64_t n2OutIdx = 0;         // n2 outer index
    int64_t s1OutIdx = 0;         // s1 outer index (s1oIdx)
    int64_t s1InnerIdx = 0;       // s1 inner index within the ratio split (loopIdx)
    int64_t s1BaseSize = 0;       // S1 base block size
    int64_t splitS1BaseSize = 0;  // s1 split size (vec1S1BaseSize)
    int64_t s2StartIdx = 0;       // s2 start index
    int64_t s2Idx = 0;            // s2 index (s2LoopCount)
    int64_t s2BaseNratioSize = 0; // s2 ratio length: s2BaseSize (S2 base block size) * nRatio

    // ---- inputs used to copy the dropout mask in ----
    // Forwarded unchanged to BoolCopyIn / Bit2Int8CopyIn by CopyInDropMask.
    uint32_t s1CopySize = 0;
    uint32_t s2CopySize = 0;
    int64_t s2TotalSize = 0;

    // ---- inputs used to apply the dropout mask ----
    uint32_t firstAxis = 0;       // copied to DropOutShapeInfo::firstAxis
    uint32_t lstAxis = 0;         // copied to DropOutShapeInfo::srcLastAxis
    uint32_t maskLstAxis = 0;     // rounded up to a multiple of blockBytes for DropOutShapeInfo::maskLastAxis
    int64_t vecCoreOffset = 0;    // added to the s1 row index in ComputeDropOffset
    float keepProb = 0.0f;        // forwarded to DropOut; placeholder default, must be set by the caller

    bool boolMode = false;        // true: BoolCopyIn path and whole-byte mask; false: Bit2Int8CopyIn path
};
|
||
|
||
template <bool hasDrop>
|
||
__aicore__ inline int64_t ComputeDropOffset(DropMaskInfo &dropMaskInfo)
|
||
{
|
||
if constexpr (hasDrop == true) {
|
||
// boidx * n2 * g* s1 * s2
|
||
int64_t bOffset = dropMaskInfo.bSSOffset * dropMaskInfo.n2G;
|
||
// n2oIdx * g * s1 *s2
|
||
int64_t n2Offset = dropMaskInfo.n2OutIdx * dropMaskInfo.gSize * dropMaskInfo.s1Size * dropMaskInfo.s2Size;
|
||
// goIdx * s1 * s2
|
||
int64_t gOffset = dropMaskInfo.gOutIdx * dropMaskInfo.s1Size * dropMaskInfo.s2Size;
|
||
// s1oIdx * s1BaseSize * s2Size + s1innerindex * vec1S1BaseSize * s2Size
|
||
int64_t s1Offset = (dropMaskInfo.s1OutIdx * dropMaskInfo.s1BaseSize + dropMaskInfo.vecCoreOffset +
|
||
dropMaskInfo.s1InnerIdx * dropMaskInfo.splitS1BaseSize) * dropMaskInfo.s2Size;
|
||
// s2StartIdx + s2index * s2BaseNratioSize
|
||
int64_t s2Offset = dropMaskInfo.s2StartIdx + dropMaskInfo.s2Idx * dropMaskInfo.s2BaseNratioSize;
|
||
return bOffset + n2Offset + gOffset + s1Offset + s2Offset;
|
||
} else {
|
||
return 0;
|
||
}
|
||
}
|
||
|
||
template <bool hasDrop>
|
||
__aicore__ inline void CopyInDropMask(LocalTensor<uint8_t>&dstTensor, GlobalTensor<uint8_t>& srcBoolTensor,
|
||
GlobalTensor<uint8_t>& srcByteTensor, DropMaskInfo &dropMaskInfo, int64_t alignedSize = blockBytes)
|
||
{
|
||
if constexpr (hasDrop == true) {
|
||
int64_t dropMaskOffset = ComputeDropOffset<hasDrop>(dropMaskInfo);
|
||
if (unlikely(dropMaskInfo.boolMode)) {
|
||
BoolCopyIn(dstTensor, srcBoolTensor, dropMaskOffset,
|
||
dropMaskInfo.s1CopySize, dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
|
||
} else {
|
||
Bit2Int8CopyIn(dstTensor, srcByteTensor, dropMaskOffset, 1,
|
||
dropMaskInfo.s1CopySize, dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
|
||
}
|
||
return;
|
||
}
|
||
}
|
||
|
||
template <typename T, bool hasDrop>
|
||
__aicore__ inline void ComputeDropMask(LocalTensor<T>& dstTensor, LocalTensor<T>& srcTensor,
|
||
LocalTensor<uint8_t>& dropoutBuffer, LocalTensor<uint8_t>& tmpDropBuffer, DropMaskInfo &dropMaskInfo)
|
||
{
|
||
if constexpr (hasDrop == true) {
|
||
DropOutShapeInfo dropOutShapeInfo;
|
||
dropOutShapeInfo.firstAxis = dropMaskInfo.firstAxis;
|
||
dropOutShapeInfo.srcLastAxis = dropMaskInfo.lstAxis;
|
||
|
||
if (unlikely(dropMaskInfo.boolMode)) {
|
||
dropOutShapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis, blockBytes) * blockBytes;
|
||
DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, dropOutShapeInfo);
|
||
} else {
|
||
dropOutShapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis / byteBitRatio, blockBytes) * blockBytes;
|
||
if (likely(dropMaskInfo.lstAxis / byteBitRatio % blockBytes == 0)) {
|
||
DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, dropOutShapeInfo);
|
||
} else {
|
||
DropOut<T, false, DROPOUT_MODE_BIT_MISALIGN>(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer,
|
||
dropMaskInfo.keepProb, dropOutShapeInfo);
|
||
}
|
||
}
|
||
return;
|
||
}
|
||
}
|
||
|
||
#endif // DROPMASK_H
|