[Kernel] add custom op GmmSwigluQuantWeightNzTensorList (#3804)

### What this PR does / why we need it?

This PR introduces support for adding custom CANN `aclnn` ops to
`vllm-ascend`, allowing users to define and use their own custom
operators.

Key changes include:
- Building and installing custom ops into the `vllm-ascend`-specified
directory
- Binding the `aclnn` op interface to the `torch.ops._C_ascend` module
- Enabling invocation of these ops within `vllm-ascend`

This PR includes a sample custom op:
`aclnnGroupedMatmulSwigluQuantWeightNzTensorList`, which is adapted from
the CANN operator
[`aclnnGroupedMatmulSwigluQuantWeightNZ`](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/API/aolapi/context/aclnnGroupedMatmulSwigluQuantWeightNZ.md).
Its input parameters `weight` and `weight_scale` now accept
`list[torch.Tensor]` (i.e., `at::TensorList`).

### Does this PR introduce _any_ user-facing change?

No.


- vLLM version: v0.11.2

---------

Signed-off-by: QianChenxi <chenxi.qian.cq@outlook.com>
This commit is contained in:
Chenxi Qian
2025-11-28 18:06:39 +08:00
committed by GitHub
parent 3199fe8350
commit 554f16ae1f
50 changed files with 6934 additions and 7 deletions

View File

@@ -0,0 +1,121 @@
/**
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
* \file dropmask.h
* \brief
*/
#ifndef DROPMASK_H
#define DROPMASK_H
#include "util.h"
using AscendC::DROPOUT_MODE_BIT_MISALIGN;
using AscendC::DropOutShapeInfo;
using AscendC::DropOut;
// Parameter bundle for locating, copying in, and applying the dropout mask.
// Grouped by the three helpers below: offset computation, mask copy-in, and
// mask application (DropOut).
struct DropMaskInfo {
// for computing the dropout mask offset
// Offsets are computed as if the B, N, G, S1 and S2 axes are all tiled; for an
// axis that is not tiled, set the corresponding field to 0 or the original size.
int64_t n2G; // n2 * g
int64_t gSize; // g
int64_t s1Size; // s1
int64_t s2Size; // s2
int64_t gOutIdx; // g out index
int64_t bSSOffset; // bOutIdx * s1 * s2, precomputed by the caller ===bSSOffset
int64_t n2OutIdx; // n out index
int64_t s1OutIdx; // s1 out index ===s1oIdx
int64_t s1InnerIdx; // s1 inner index (ratio loop) ===loopIdx
int64_t s1BaseSize; // base block size of the S1 axis
int64_t splitS1BaseSize; // s1 split size ===vec1S1BaseSize
int64_t s2StartIdx; // s2 start index
int64_t s2Idx; // s2 index =====s2LoopCount
int64_t s2BaseNratioSize; // ratio length of s2: s2BaseSize (base block size of S2) * nRatio
// for copying in the dropout mask
uint32_t s1CopySize;
uint32_t s2CopySize;
int64_t s2TotalSize;
// for computing (applying) the dropout mask
uint32_t firstAxis;
uint32_t lstAxis;     // last axis of the source data
uint32_t maskLstAxis; // last axis of the mask (pre-alignment)
int64_t vecCoreOffset = 0; // extra per-vector-core row offset folded into the s1 term
float keepProb;            // dropout keep probability passed through to DropOut
bool boolMode;             // true: byte-per-element bool mask; false: bit-packed mask
};
// Computes the flat element offset of the current tile inside the global
// dropout mask laid out as [B, N2, G, S1, S2]. Returns 0 when dropout is
// disabled (the mask is never read in that case).
template <bool hasDrop>
__aicore__ inline int64_t ComputeDropOffset(DropMaskInfo &dropMaskInfo)
{
    if constexpr (!hasDrop) {
        return 0;
    } else {
        const int64_t ssSize = dropMaskInfo.s1Size * dropMaskInfo.s2Size; // s1 * s2
        // batch term: bOutIdx * n2 * g * s1 * s2
        const int64_t batchPart = dropMaskInfo.bSSOffset * dropMaskInfo.n2G;
        // n2 term: n2OutIdx * g * s1 * s2
        const int64_t headPart = dropMaskInfo.n2OutIdx * dropMaskInfo.gSize * ssSize;
        // g term: gOutIdx * s1 * s2
        const int64_t groupPart = dropMaskInfo.gOutIdx * ssSize;
        // s1 term: (s1OutIdx * s1BaseSize + vecCoreOffset + s1InnerIdx * splitS1BaseSize) * s2
        const int64_t rowPart = (dropMaskInfo.s1OutIdx * dropMaskInfo.s1BaseSize + dropMaskInfo.vecCoreOffset +
                                 dropMaskInfo.s1InnerIdx * dropMaskInfo.splitS1BaseSize) * dropMaskInfo.s2Size;
        // s2 term: s2StartIdx + s2Idx * s2BaseNratioSize
        const int64_t colPart = dropMaskInfo.s2StartIdx + dropMaskInfo.s2Idx * dropMaskInfo.s2BaseNratioSize;
        return batchPart + headPart + groupPart + rowPart + colPart;
    }
}
// Stages the dropout mask tile for the current offset into local memory.
// Dispatches on boolMode: byte-per-element masks go through BoolCopyIn,
// bit-packed masks are expanded to int8 via Bit2Int8CopyIn. No-op when
// dropout is disabled.
template <bool hasDrop>
__aicore__ inline void CopyInDropMask(LocalTensor<uint8_t>& dstTensor, GlobalTensor<uint8_t>& srcBoolTensor,
    GlobalTensor<uint8_t>& srcByteTensor, DropMaskInfo &dropMaskInfo, int64_t alignedSize = blockBytes)
{
    if constexpr (!hasDrop) {
        return;
    } else {
        const int64_t maskGmOffset = ComputeDropOffset<hasDrop>(dropMaskInfo);
        if (unlikely(dropMaskInfo.boolMode)) {
            // byte-per-element bool mask path
            BoolCopyIn(dstTensor, srcBoolTensor, maskGmOffset, dropMaskInfo.s1CopySize,
                dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
        } else {
            // bit-packed mask path
            Bit2Int8CopyIn(dstTensor, srcByteTensor, maskGmOffset, 1, dropMaskInfo.s1CopySize,
                dropMaskInfo.s2CopySize, dropMaskInfo.s2TotalSize, alignedSize);
        }
    }
}
// Applies the staged dropout mask to srcTensor, writing the result to
// dstTensor via AscendC::DropOut. The mask's last axis is rounded up to a
// full block; for bit-packed masks whose source last axis is not block
// aligned (in bytes), the DROPOUT_MODE_BIT_MISALIGN variant is used.
// No-op when dropout is disabled.
template <typename T, bool hasDrop>
__aicore__ inline void ComputeDropMask(LocalTensor<T>& dstTensor, LocalTensor<T>& srcTensor,
    LocalTensor<uint8_t>& dropoutBuffer, LocalTensor<uint8_t>& tmpDropBuffer, DropMaskInfo &dropMaskInfo)
{
    if constexpr (!hasDrop) {
        return;
    } else {
        DropOutShapeInfo shapeInfo;
        shapeInfo.firstAxis = dropMaskInfo.firstAxis;
        shapeInfo.srcLastAxis = dropMaskInfo.lstAxis;
        if (unlikely(dropMaskInfo.boolMode)) {
            // Byte mask: align the mask's last axis up to a whole block.
            shapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis, blockBytes) * blockBytes;
            DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, shapeInfo);
            return;
        }
        // Bit-packed mask: last axis is measured in bytes after dividing by byteBitRatio.
        shapeInfo.maskLastAxis = CeilDiv(dropMaskInfo.maskLstAxis / byteBitRatio, blockBytes) * blockBytes;
        const bool srcAxisBlockAligned = (dropMaskInfo.lstAxis / byteBitRatio % blockBytes == 0);
        if (likely(srcAxisBlockAligned)) {
            DropOut(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer, dropMaskInfo.keepProb, shapeInfo);
        } else {
            // Misaligned bit mask needs the dedicated DropOut mode.
            DropOut<T, false, DROPOUT_MODE_BIT_MISALIGN>(dstTensor, srcTensor, dropoutBuffer, tmpDropBuffer,
                dropMaskInfo.keepProb, shapeInfo);
        }
    }
}
#endif // DROPMASK_H