Files
xc-llm-ascend/csrc/utils/inc/kernel/dropmask.h
Chenxi Qian 554f16ae1f [Kernel] add custom op GmmSwigluQuantWeightNzTensorList (#3804)
### What this PR does / why we need it?

This PR introduces support for adding custom CANN `aclnn` ops to
`vllm-ascend`, allowing users to define and use their own custom
operators.

Key changes include:
- Building and installing custom ops into the `vllm-ascend`-specified
directory
- Binding the `aclnn` op interface to the `torch.ops._C_ascend` module
- Enabling invocation of these ops within `vllm-ascend`

This PR includes a sample custom op:
`aclnnGroupedMatmulSwigluQuantWeightNzTensorList`, which is adapted from
the CANN operator
[`aclnnGroupedMatmulSwigluQuantWeightNZ`](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/API/aolapi/context/aclnnGroupedMatmulSwigluQuantWeightNZ.md).
Its input parameters `weight` and `weight_scale` now accept
`list[torch.Tensor]` (i.e., `at::TensorList`).

### Does this PR introduce _any_ user-facing change?

No.


- vLLM version: v0.11.2

---------

Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
2025-11-28 18:06:39 +08:00

122 lines
5.0 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
* \file dropmask.h
* \brief
*/
#ifndef DROPMASK_H
#define DROPMASK_H
#include "util.h"
using AscendC::DROPOUT_MODE_BIT_MISALIGN;
using AscendC::DropOutShapeInfo;
using AscendC::DropOut;
// Tiling/loop-state bundle used to locate, copy in, and apply the dropout mask.
// Shared by ComputeDropOffset / CopyInDropMask / ComputeDropMask below.
struct DropMaskInfo {
// For computing the dropout mask offset.
// Offsets assume B/N2/G/S1/S2 are all tiled; for an axis that is not split,
// set the corresponding field to 0 or to the full original size as appropriate.
int64_t n2G; // n2 * g
int64_t gSize; // g
int64_t s1Size; // s1
int64_t s2Size; // s2
int64_t gOutIdx; // g out index
int64_t bSSOffset; // boidx * s1 * s2 ===bSSOffset (batch outer index premultiplied by s1 * s2)
int64_t n2OutIdx; // n out index
int64_t s1OutIdx; // s1 out index ===s1oIdx
int64_t s1InnerIdx; // s1 inner index (n-ratio loop) ===loopIdx
int64_t s1BaseSize; // base tile size along S1
int64_t splitS1BaseSize; // s1 split size ===vec1S1BaseSize
int64_t s2StartIdx; // s2 start index
int64_t s2Idx; // s2 index =====s2LoopCount
int64_t s2BaseNratioSize; // n-ratio length along s2: s2BaseSize (base tile size along S2) * nRatio
// For copying in the dropout mask.
uint32_t s1CopySize;
uint32_t s2CopySize;
int64_t s2TotalSize;
// For applying the dropout mask.
uint32_t firstAxis;
uint32_t lstAxis;
uint32_t maskLstAxis;
int64_t vecCoreOffset = 0;
float keepProb; // probability of keeping an element (scale factor 1/keepProb applied by DropOut)
bool boolMode; // true: one mask byte per element; false: bit-packed mask (1 bit per element)
};
// Computes the flat element offset of the current tile inside the dropout
// mask, whose logical layout is [B, N2, G, S1, S2].
// Returns 0 when dropout is compiled out (hasDrop == false).
template <bool hasDrop>
__aicore__ inline int64_t ComputeDropOffset(DropMaskInfo &dropMaskInfo)
{
    if constexpr (!hasDrop) {
        return 0;
    } else {
        // s1 * s2 is the stride shared by the n2 and g axes.
        const int64_t s1s2 = dropMaskInfo.s1Size * dropMaskInfo.s2Size;
        // batch: boidx * n2 * g * s1 * s2 (bSSOffset already carries boidx * s1 * s2)
        const int64_t offB = dropMaskInfo.bSSOffset * dropMaskInfo.n2G;
        // head: n2oIdx * g * s1 * s2
        const int64_t offN2 = dropMaskInfo.n2OutIdx * dropMaskInfo.gSize * s1s2;
        // group: goIdx * s1 * s2
        const int64_t offG = dropMaskInfo.gOutIdx * s1s2;
        // s1 row index: outer tile + per-core offset + inner (n-ratio) tile
        const int64_t s1Row = dropMaskInfo.s1OutIdx * dropMaskInfo.s1BaseSize + dropMaskInfo.vecCoreOffset +
                              dropMaskInfo.s1InnerIdx * dropMaskInfo.splitS1BaseSize;
        // s2 column: start index + loop index * n-ratio tile size
        const int64_t offS2 = dropMaskInfo.s2StartIdx + dropMaskInfo.s2Idx * dropMaskInfo.s2BaseNratioSize;
        return offB + offN2 + offG + s1Row * dropMaskInfo.s2Size + offS2;
    }
}
// Copies the dropout-mask tile for the current loop position from global
// memory into the local buffer. Dispatches on dropMaskInfo.boolMode:
//  - bit-packed mask (common case): Bit2Int8CopyIn expands 1 bit/element
//  - byte mask: BoolCopyIn copies 1 byte/element
// Compiles to a no-op when hasDrop == false.
template <bool hasDrop>
__aicore__ inline void CopyInDropMask(LocalTensor<uint8_t>&dstTensor, GlobalTensor<uint8_t>& srcBoolTensor,
GlobalTensor<uint8_t>& srcByteTensor, DropMaskInfo &dropMaskInfo, int64_t alignedSize = blockBytes)
{
    if constexpr (hasDrop) {
        const int64_t maskOffset = ComputeDropOffset<hasDrop>(dropMaskInfo);
        if (likely(!dropMaskInfo.boolMode)) {
            // Bit-packed source: expand bits to int8 while copying in.
            Bit2Int8CopyIn(dstTensor, srcByteTensor, maskOffset, 1,
                dropMaskInfo.s1CopySize, dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
        } else {
            // Byte-per-element bool source.
            BoolCopyIn(dstTensor, srcBoolTensor, maskOffset,
                dropMaskInfo.s1CopySize, dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
        }
    }
}
// Applies the dropout mask to srcTensor via AscendC::DropOut, writing the
// result to dstTensor. maskLastAxis is padded up to a whole block; for the
// bit-packed mode a misaligned last axis selects the BIT_MISALIGN variant.
// Compiles to a no-op when hasDrop == false.
template <typename T, bool hasDrop>
__aicore__ inline void ComputeDropMask(LocalTensor<T>& dstTensor, LocalTensor<T>& srcTensor,
LocalTensor<uint8_t>& dropoutBuffer, LocalTensor<uint8_t>& tmpDropBuffer, DropMaskInfo &dropMaskInfo)
{
    if constexpr (hasDrop) {
        DropOutShapeInfo shapeInfo;
        shapeInfo.firstAxis = dropMaskInfo.firstAxis;
        shapeInfo.srcLastAxis = dropMaskInfo.lstAxis;
        if (unlikely(dropMaskInfo.boolMode)) {
            // Byte mask: one mask byte per element, padded to a full block.
            shapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis, blockBytes) * blockBytes;
            DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, shapeInfo);
            return;
        }
        // Bit-packed mask: 8 elements per mask byte, padded to a full block.
        shapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis / byteBitRatio, blockBytes) * blockBytes;
        const bool lastAxisBlockAligned = (dropMaskInfo.lstAxis / byteBitRatio % blockBytes == 0);
        if (likely(lastAxisBlockAligned)) {
            DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, shapeInfo);
        } else {
            DropOut<T, false, DROPOUT_MODE_BIT_MISALIGN>(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer,
                dropMaskInfo.keepProb, shapeInfo);
        }
    }
}
#endif // DROPMASK_H