[Kernel] add custom op MatmulAllreduceAddRmsnorm (#4606)

What this PR does / why we need it?
Adds a fused custom operator for Qwen3-32B that combines Matmul, AllReduce,
Add, and RMSNorm into a single kernel for better performance.

Does this PR introduce _any_ user-facing change?
No

How was this patch tested?

vLLM version: v0.11.2
vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

Signed-off-by: tongrunze <t00574058@china.huawei.com>
Co-authored-by: tongrunze <t00574058@china.huawei.com>
This commit is contained in:
Trunrain
2025-12-10 09:05:33 +08:00
committed by GitHub
parent f404c9af7f
commit ba9cda9dfd
16 changed files with 2854 additions and 1 deletions

View File

@@ -11,7 +11,20 @@ if [[ "$SOC_VERSION" =~ ^ascend310 ]]; then
exit 0
elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
  # ASCEND910B (A2) series.
  # Dependency: catlass (header-only library vendored as a git submodule);
  # required by the matmul_allreduce_add_rmsnorm custom op.
  git config --global --add safe.directory "$ROOT_DIR"
  CATLASS_PATH=${ROOT_DIR}/csrc/third_party/catlass/include
  if [[ ! -d "${CATLASS_PATH}" ]]; then
    echo "dependency catlass is missing, try to fetch it..."
    if ! git submodule update --init --recursive; then
      # Diagnostics go to stderr; abort the build, the op cannot compile
      # without the catlass headers.
      echo "fetch failed" >&2
      exit 1
    fi
  fi
  # Resolve to an absolute path so CPATH stays valid regardless of the
  # caller's working directory; fail fast if the directory is unreadable.
  ABSOLUTE_CATLASS_PATH=$(cd "${CATLASS_PATH}" && pwd) || exit 1
  # Append the previous CPATH only when it is non-empty: a bare trailing
  # ':' would add an empty entry (= current directory) to the include
  # search path, and referencing an unset CPATH breaks under 'set -u'.
  export CPATH=${ABSOLUTE_CATLASS_PATH}${CPATH:+:${CPATH}}
  CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm"
  SOC_ARG="ascend910b"
elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
# ASCEND910C (A3) series