[OP] add custom op aclnnMoeInitRoutingCustom (#5251)
<!-- Thanks for sending a pull request! BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html --> ### What this PR does / why we need it? <!-- - Please clarify what changes you are proposing. The purpose of this section is to outline the changes and how this PR fixes the issue. If possible, please consider writing useful notes for better and faster reviews in your PR. - Please clarify why the changes are needed. For instance, the use case and bug description. - Fixes # --> This pull request introduces a new custom operator `aclnnMoeInitRoutingCustom` for Mixture-of-Experts models. It can be replaced by `aclnnMoeInitRoutingV3` once CANN 8.5 becomes available. ### Does this PR introduce _any_ user-facing change? <!-- Note that it means *any* user-facing change including all aspects such as API, interface or other behavior changes. Documentation-only updates are not considered user-facing changes. --> No. ### How was this patch tested? <!-- CI passed with new added/existing test. If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future. If tests were not added, please describe why they were not added and/or why it was difficult to add. --> --------- Signed-off-by: jiazhengyi <jiazhengyi@huawei.com> Signed-off-by: Chenxi Qian <chenxi.qian.cq@outlook.com> Co-authored-by: jiazhengyi <jiazhengyi@huawei.com> Co-authored-by: Chenxi Qian <chenxi.qian.cq@outlook.com>
This commit is contained in:
@@ -24,7 +24,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
|
||||
ABSOLUTE_CATLASS_PATH=$(cd "${CATLASS_PATH}" && pwd)
|
||||
export CPATH=${ABSOLUTE_CATLASS_PATH}:${CPATH}
|
||||
|
||||
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm"
|
||||
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm;moe_init_routing_custom"
|
||||
SOC_ARG="ascend910b"
|
||||
elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
|
||||
# ASCEND910C (A3) series
|
||||
@@ -69,6 +69,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
|
||||
"moe_dispatch_normal"
|
||||
"dispatch_layout"
|
||||
"notify_dispatch"
|
||||
"moe_init_routing_custom"
|
||||
)
|
||||
CUSTOM_OPS=$(IFS=';'; echo "${CUSTOM_OPS_ARRAY[*]}")
|
||||
SOC_ARG="ascend910_93"
|
||||
|
||||
55
csrc/moe_init_routing_custom/op_host/CMakeLists.txt
Normal file
55
csrc/moe_init_routing_custom/op_host/CMakeLists.txt
Normal file
@@ -0,0 +1,55 @@
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
# This file is a part of the CANN Open Software.
|
||||
# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
|
||||
# Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# See LICENSE in the root of the software repository for the full text of the License.
|
||||
# ======================================================================================================================
|
||||
|
||||
add_ops_compile_options(
|
||||
OP_NAME MoeInitRoutingCustom
|
||||
OPTIONS --cce-auto-sync=on
|
||||
-Wno-deprecated-declarations
|
||||
-Werror
|
||||
)
|
||||
|
||||
target_sources(op_host_aclnnExc PRIVATE
|
||||
moe_init_routing_custom_def.cpp
|
||||
)
|
||||
|
||||
target_sources(opapi PRIVATE
|
||||
moe_init_routing_custom.cpp
|
||||
aclnn_moe_init_routing_custom.cpp
|
||||
)
|
||||
|
||||
if (NOT BUILD_OPEN_PROJECT)
|
||||
target_sources(aclnn_ops_train PRIVATE
|
||||
moe_init_routing_custom.cpp
|
||||
aclnn_moe_init_routing_custom.cpp
|
||||
)
|
||||
|
||||
target_sources(aclnn_ops_infer PRIVATE
|
||||
moe_init_routing_custom.cpp
|
||||
aclnn_moe_init_routing_custom.cpp
|
||||
)
|
||||
endif ()
|
||||
|
||||
target_sources(optiling PRIVATE
|
||||
moe_init_routing_custom_tiling_base.cpp
|
||||
moe_init_routing_custom_tiling.cpp
|
||||
)
|
||||
|
||||
target_include_directories(optiling PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
)
|
||||
|
||||
target_sources(opsproto PRIVATE
|
||||
moe_init_routing_custom_infershape.cpp
|
||||
)
|
||||
|
||||
file(GLOB _GMM_Aclnn_header "${CMAKE_CURRENT_SOURCE_DIR}/aclnn_moe_init_routing_custom.h")
|
||||
|
||||
install(FILES ${_GMM_Aclnn_header}
|
||||
DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL
|
||||
)
|
||||
@@ -0,0 +1,143 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <tuple>
|
||||
#include <cstddef>
|
||||
#include "opdev/make_op_executor.h"
|
||||
#include "aclnn_kernels/contiguous.h"
|
||||
#include "opdev/tensor_view_utils.h"
|
||||
#include "aclnn_kernels/common/op_error_check.h"
|
||||
#include "opdev/op_log.h"
|
||||
#include "aclnn_kernels/cast.h"
|
||||
#include "opdev/common_types.h"
|
||||
#include "moe_init_routing_custom.h"
|
||||
#include "aclnn_moe_init_routing_custom.h"
|
||||
|
||||
using namespace op;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
static const int64_t MOE_DIM_2 = 2;
|
||||
static const int64_t MOE_DIM_1 = 1;
|
||||
}
|
||||
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_X= {DataType::DT_FLOAT16, DataType::DT_BF16, DataType::DT_FLOAT, DataType::DT_INT8};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPERT_IDX = {DataType::DT_INT32};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_SCALE = {DataType::DT_FLOAT};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_OFFSET= {DataType::DT_FLOAT};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_X_OUT = {DataType::DT_FLOAT16, DataType::DT_BF16, DataType::DT_FLOAT, DataType::DT_INT8};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_ROW_IDX_OUT = {DataType::DT_INT32};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPERT_TOKENS_COUNT_OR_CUMSUMOUT = {DataType::DT_INT64};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_SCALE_OUT = {DataType::DT_FLOAT};
|
||||
|
||||
static inline bool CheckNotNull(const aclTensor *x,
|
||||
const aclTensor *expertIdx,
|
||||
const aclTensor *expandedXOut,
|
||||
const aclTensor *expandedRowIdxOut,
|
||||
const aclTensor *expertTokensCountOrCumsumOut,
|
||||
const aclTensor *expandedScaleOut) {
|
||||
OP_CHECK_NULL(x, return false);
|
||||
OP_CHECK_NULL(expertIdx, return false);
|
||||
OP_CHECK_NULL(expandedXOut, return false);
|
||||
OP_CHECK_NULL(expandedRowIdxOut, return false);
|
||||
OP_CHECK_NULL(expertTokensCountOrCumsumOut, return false);
|
||||
OP_CHECK_NULL(expandedScaleOut, return false);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
aclnnStatus aclnnMoeInitRoutingCustomGetWorkspaceSize(const aclTensor *x,
|
||||
const aclTensor *expertIdx,
|
||||
const aclTensor *scaleOptional,
|
||||
const aclTensor *offsetOptional,
|
||||
int64_t activeNum,
|
||||
int64_t expertCapacity,
|
||||
int64_t expertNum,
|
||||
int64_t dropPadMode,
|
||||
int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag,
|
||||
int64_t quantMode,
|
||||
const aclIntArray *activeExpertRangeOptional,
|
||||
int64_t rowIdxType,
|
||||
const aclTensor *expandedXOut,
|
||||
const aclTensor *expandedRowIdxOut,
|
||||
const aclTensor *expertTokensCountOrCumsumOut,
|
||||
const aclTensor *expandedScaleOut,
|
||||
uint64_t *workspaceSize,
|
||||
aclOpExecutor **executor)
|
||||
{
|
||||
L2_DFX_PHASE_1(aclnnMoeInitRoutingCustom,
|
||||
DFX_IN(x, expertIdx, scaleOptional, offsetOptional,
|
||||
activeNum, expertCapacity, expertNum, dropPadMode,
|
||||
expertTokensNumType, expertTokensNumFlag, quantMode, activeExpertRangeOptional, rowIdxType),
|
||||
DFX_OUT(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut));
|
||||
auto ret = CheckNotNull(x, expertIdx, expandedXOut, expandedRowIdxOut,
|
||||
expertTokensCountOrCumsumOut, expandedScaleOut);
|
||||
|
||||
CHECK_RET(ret, ACLNN_ERR_PARAM_NULLPTR);
|
||||
|
||||
auto uniqueExecutor = CREATE_EXECUTOR();
|
||||
CHECK_RET(uniqueExecutor.get() != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
|
||||
auto xContiguous = l0op::Contiguous(x, uniqueExecutor.get());
|
||||
CHECK_RET(xContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
auto expertIdxContiguous = l0op::Contiguous(expertIdx, uniqueExecutor.get());
|
||||
CHECK_RET(expertIdxContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
|
||||
const aclTensor* scaleContiguous = nullptr;
|
||||
const aclTensor* offsetContiguous = nullptr;
|
||||
if (scaleOptional != nullptr) {
|
||||
scaleContiguous = l0op::Contiguous(scaleOptional, uniqueExecutor.get());
|
||||
CHECK_RET(scaleContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
}
|
||||
|
||||
if (offsetOptional != nullptr) {
|
||||
offsetContiguous = l0op::Contiguous(offsetOptional, uniqueExecutor.get());
|
||||
CHECK_RET(offsetContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
}
|
||||
|
||||
auto routingResult = std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(nullptr, nullptr, nullptr, nullptr);
|
||||
routingResult = l0op::MoeInitRoutingCustom(xContiguous, expertIdxContiguous, scaleContiguous, offsetContiguous,
|
||||
activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag,
|
||||
quantMode, activeExpertRangeOptional, rowIdxType, expandedXOut, expandedRowIdxOut,
|
||||
expertTokensCountOrCumsumOut, expandedScaleOut, uniqueExecutor.get());
|
||||
auto [expandedXOut_, expandedRowIdxOut_, expertTokensCountOrCumsumOut_, expandedScaleOut_] = routingResult;
|
||||
bool hasNullptr = (expandedXOut_ == nullptr) || (expandedRowIdxOut_ == nullptr) || (expertTokensCountOrCumsumOut_ == nullptr) || (expandedScaleOut_ == nullptr);
|
||||
CHECK_RET(hasNullptr != true, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
auto viewCopyExpandedXOutResult = l0op::ViewCopy(expandedXOut_, expandedXOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpandedXOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
auto viewCopyExpandedRowIdxOutResult = l0op::ViewCopy(expandedRowIdxOut_, expandedRowIdxOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpandedRowIdxOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
auto viewCopyExpertTokensCountOrCumsumOutResult = l0op::ViewCopy(expertTokensCountOrCumsumOut_, expertTokensCountOrCumsumOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpertTokensCountOrCumsumOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
auto viewCopyExpandedScaleOutResult = l0op::ViewCopy(expandedScaleOut_, expandedScaleOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpandedScaleOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
*workspaceSize = uniqueExecutor->GetWorkspaceSize();
|
||||
uniqueExecutor.ReleaseTo(executor);
|
||||
return ACLNN_SUCCESS;
|
||||
}
|
||||
aclnnStatus aclnnMoeInitRoutingCustom(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
|
||||
aclrtStream stream)
|
||||
{
|
||||
L2_DFX_PHASE_2(aclnnMoeInitRoutingCustom);
|
||||
return CommonOpExecutorRun(workspace, workspaceSize, executor, stream);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,47 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#ifndef OP_API_INC_MOE_INIT_ROUTING_CUSTOM_H_
|
||||
#define OP_API_INC_MOE_INIT_ROUTING_CUSTOM_H_
|
||||
|
||||
#include "aclnn/aclnn_base.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
__attribute__((visibility("default"))) aclnnStatus aclnnMoeInitRoutingCustomGetWorkspaceSize(const aclTensor *x,
|
||||
const aclTensor *expertIdx,
|
||||
const aclTensor *scaleOptional,
|
||||
const aclTensor *offsetOptional,
|
||||
int64_t activeNum,
|
||||
int64_t expertCapacity,
|
||||
int64_t expertNum,
|
||||
int64_t dropPadMode,
|
||||
int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag,
|
||||
int64_t quantMode,
|
||||
const aclIntArray *activeExpertRangeOptional,
|
||||
int64_t rowIdxType,
|
||||
const aclTensor *expandedXOut,
|
||||
const aclTensor *expandedRowIdxOut,
|
||||
const aclTensor *expertTokensCountOrCumsumOut,
|
||||
const aclTensor *expandedScaleOut,
|
||||
uint64_t *workspaceSize,
|
||||
aclOpExecutor **executor);
|
||||
|
||||
__attribute__((visibility("default"))) aclnnStatus aclnnMoeInitRoutingCustom(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
|
||||
aclrtStream stream);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#include <tuple>
|
||||
#include "moe_init_routing_custom.h"
|
||||
#include "opdev/make_op_executor.h"
|
||||
#include "opdev/op_def.h"
|
||||
#include "opdev/op_dfx.h"
|
||||
#include "opdev/op_executor.h"
|
||||
#include "opdev/op_log.h"
|
||||
#include "opdev/shape_utils.h"
|
||||
#include "aclnn_kernels/common/op_error_check.h"
|
||||
|
||||
using namespace op;
|
||||
|
||||
namespace l0op {
|
||||
OP_TYPE_REGISTER(MoeInitRoutingCustom);
|
||||
|
||||
std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*> MoeInitRoutingCustom(const aclTensor *x, const aclTensor *expertIdx, const aclTensor *scale,
|
||||
const aclTensor *offset, int64_t activeNum, int64_t expertCapacity,
|
||||
int64_t expertNum, int64_t dropPadMode, int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag, int64_t quantMode, const aclIntArray *activeExpertRange,
|
||||
int64_t rowIdxType, const aclTensor *expandedX, const aclTensor *expandedRowIdx,
|
||||
const aclTensor *expertTokensCountOrCumsum, const aclTensor *expandedScale, aclOpExecutor *executor)
|
||||
{
|
||||
L0_DFX(MoeInitRoutingCustom, x, expertIdx, scale, offset, activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag,
|
||||
quantMode, activeExpertRange, rowIdxType, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale);
|
||||
|
||||
auto expandedXOut = executor->AllocTensor(expandedX->GetViewShape(), expandedX->GetDataType(), Format::FORMAT_ND);
|
||||
auto expandedRowIdxOut = executor->AllocTensor(expandedRowIdx->GetViewShape(), expandedRowIdx->GetDataType(), Format::FORMAT_ND);
|
||||
auto expertTokensCountOrCumsumOut = executor->AllocTensor(expertTokensCountOrCumsum->GetViewShape(), expertTokensCountOrCumsum->GetDataType(), Format::FORMAT_ND);
|
||||
auto expandedScaleOut = executor->AllocTensor(expandedScale->GetViewShape(), expandedScale->GetDataType(), Format::FORMAT_ND);
|
||||
if (expandedXOut == nullptr || expandedRowIdxOut == nullptr || expertTokensCountOrCumsumOut == nullptr || expandedScaleOut == nullptr) {
|
||||
OP_LOGE(ACLNN_ERR_INNER_NULLPTR, "alloc expandedXOut or expandedRowIdxOut or expertTokensCountOrCumsumOut or expandedScaleOut tensor failed.");
|
||||
return std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(nullptr, nullptr, nullptr, nullptr);
|
||||
}
|
||||
|
||||
ADD_TO_LAUNCHER_LIST_AICORE(
|
||||
MoeInitRoutingCustom, OP_INPUT(x, expertIdx, scale, offset), OP_OUTPUT(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut), OP_ATTR(activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag, quantMode, activeExpertRange, rowIdxType));
|
||||
return std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut); //OP_OUTPUT
|
||||
}
|
||||
|
||||
} // namespace l0op
|
||||
@@ -0,0 +1,25 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#ifndef OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
|
||||
#define OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
|
||||
|
||||
#include <tuple>
|
||||
#include "opdev/op_executor.h"
|
||||
|
||||
namespace l0op {
|
||||
std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*> MoeInitRoutingCustom(const aclTensor *x, const aclTensor *expertIdx, const aclTensor *scale,
|
||||
const aclTensor *offset, int64_t activeNum, int64_t expertCapacity,
|
||||
int64_t expertNum, int64_t dropPadMode, int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag, int64_t quantMode, const aclIntArray *activeExpertRange,
|
||||
int64_t rowIdxType, const aclTensor *expandedX, const aclTensor *expandedRowIdx,
|
||||
const aclTensor *expertTokensCountOrCumsum, const aclTensor *expandedScale, aclOpExecutor *executor);
|
||||
} // namespace l0op
|
||||
#endif // OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
|
||||
@@ -0,0 +1,105 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_init_routing_v3_def.cpp
|
||||
* \brief
|
||||
*/
|
||||
#include "register/op_def_registry.h"
|
||||
|
||||
namespace ops {
|
||||
class MoeInitRoutingCustom : public OpDef {
|
||||
public:
|
||||
explicit MoeInitRoutingCustom(const char *name) : OpDef(name)
|
||||
{
|
||||
this->Input("x")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT8, ge::DT_FLOAT16, ge::DT_BF16, ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Input("expert_idx")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Input("scale")
|
||||
.ParamType(OPTIONAL)
|
||||
.DataType(
|
||||
{ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Input("offset")
|
||||
.ParamType(OPTIONAL)
|
||||
.DataType(
|
||||
{ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Output("expanded_x")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType({ge::DT_INT8, ge::DT_FLOAT16, ge::DT_BF16, ge::DT_FLOAT, ge::DT_INT8, ge::DT_INT8, ge::DT_INT8})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Output("expanded_row_idx")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Output("expert_tokens_count_or_cumsum")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Output("expanded_scale")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Attr("active_num").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("expert_capacity").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("expert_num").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("drop_pad_mode").AttrType(OPTIONAL).Int(0);
|
||||
this->Attr("expert_tokens_num_type").AttrType(OPTIONAL).Int(0);
|
||||
this->Attr("expert_tokens_num_flag").AttrType(OPTIONAL).Bool(false);
|
||||
this->Attr("quant_mode").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("active_expert_range").AttrType(OPTIONAL).ListInt({});
|
||||
this->Attr("row_idx_type").AttrType(OPTIONAL).Int(0);
|
||||
this->AICore().AddConfig("ascend910b");
|
||||
this->AICore().AddConfig("ascend910_93");
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
OP_ADD(MoeInitRoutingCustom);
|
||||
} // namespace ops
|
||||
@@ -0,0 +1,797 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/* !
|
||||
* \file moe_init_routing_custom_infershape.cpp
|
||||
* \brief
|
||||
*/
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "register/op_def_registry.h"
|
||||
#include "log/ops_log.h"
|
||||
#include "platform/platform_info.h"
|
||||
|
||||
#define unlikely(x) __builtin_expect((x), 0)
|
||||
#define OP_CHECK_NULL_WITH_CONTEXT(context, ptr) \
|
||||
do { \
|
||||
if (unlikely((ptr) == nullptr)) { \
|
||||
const char* name = (unlikely(((context) == nullptr) || (context)->GetNodeName() == nullptr)) ? \
|
||||
"nil" : \
|
||||
(context)->GetNodeName(); \
|
||||
OPS_LOG_E(name, "%s is nullptr!", #ptr); \
|
||||
return ge::GRAPH_FAILED; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
using namespace ge;
|
||||
namespace ops {
|
||||
static constexpr size_t DIM_ONE = 1U;
|
||||
static constexpr size_t DIM_TWO = 2U;
|
||||
static constexpr size_t DIM_THREE = 3U;
|
||||
static constexpr int64_t NEG_ONE = static_cast<int64_t>(-1);
|
||||
static constexpr int64_t NEG_TWO = static_cast<int64_t>(-2);
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_X = 0;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_EXPERT_IDX = 1;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_SCALE = 2;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_OFFSET = 3;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_NUM = 0;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_CAPACITY = 1;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_NUM = 2;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_DROP_PAD_MODE = 3;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_TYPE = 4;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_FLAG = 5;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_QUANT_MODE = 6;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_EXPERT_RANGE = 7;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_ROW_IDX_TYPE = 8;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X = 0;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX = 1;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT = 2;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE = 3;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND = 10240;
|
||||
static constexpr int64_t KEY_VALUE_MODE_DIM0_NUM = 2;
|
||||
enum DropPadMode : int8_t {
|
||||
NO_DROP_PAD = 0,
|
||||
DROP_PAD = 1,
|
||||
};
|
||||
enum QuantMode : int8_t {
|
||||
NON_QUANT = -1,
|
||||
STATIC_QUANT = 0,
|
||||
DYNAMIC_QUANT = 1
|
||||
};
|
||||
enum ExpertTokenNumType : int8_t {
|
||||
CUMSUM = 0,
|
||||
COUNT = 1,
|
||||
KEY_VALUE = 2
|
||||
};
|
||||
|
||||
static bool isSameDim(int64_t dim1, int64_t dim2)
|
||||
{
|
||||
if (dim1 <= NEG_ONE || dim2 <= NEG_ONE) {
|
||||
return true;
|
||||
}
|
||||
return dim1 == dim2;
|
||||
}
|
||||
|
||||
static ge::graphStatus GetAndCheckAttrActiveExpertRange(const gert::RuntimeAttrs *attrs,
|
||||
gert::InferShapeContext *context, int64_t &expertStart,
|
||||
int64_t &expertEnd, int64_t &experNum)
|
||||
{
|
||||
OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrActiveExpertRange.");
|
||||
// Check if active_expert_range size is 2 and if expert_start < expert_end
|
||||
auto activeExpertRangePtr = attrs->GetListInt(MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_EXPERT_RANGE);
|
||||
if (nullptr == activeExpertRangePtr) {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_expert_range should be list int. But it is none.");
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
int64_t activeExpertRangeSize = activeExpertRangePtr->GetSize();
|
||||
if (activeExpertRangePtr->GetSize() == DIM_TWO) {
|
||||
expertStart = activeExpertRangePtr->GetData()[0];
|
||||
expertEnd = activeExpertRangePtr->GetData()[1];
|
||||
if (expertStart >= expertEnd || expertStart < 0 || expertEnd > MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND) {
|
||||
OPS_LOG_E(context->GetNodeName(),
|
||||
"The active_expert_range should be in [0, %ld), but the active_expert_range is [%ld, %ld).",
|
||||
MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND, expertStart, expertEnd);
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
} else if (activeExpertRangePtr->GetSize() == 0) {
|
||||
expertStart = 0;
|
||||
expertEnd = experNum;
|
||||
} else {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_expert_range size should be 2, but its size is %ld.", activeExpertRangeSize);
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
|
||||
OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrActiveExpertRange.");
|
||||
return ge::GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
static ge::graphStatus GetAndCheckAttrActiveNum(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
|
||||
int64_t &activeNum, int64_t &dropPadMode)
|
||||
{
|
||||
OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrActiveNum.");
|
||||
const int64_t *activeNumPtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_NUM);
|
||||
if (nullptr == activeNumPtr) {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_num should not be none.");
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
activeNum = *activeNumPtr;
|
||||
if (dropPadMode == DropPadMode::NO_DROP_PAD && activeNum < -1) {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_num should be greater than or equal to 0. But it is %ld.", activeNum);
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
|
||||
OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrActiveNum.");
|
||||
return ge::GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
// Reads and validates the "expert_capacity" attribute.
// Only the upper bound is enforced, and only in DROP_PAD mode with a static (known) first
// dim of x: expert_capacity must not exceed x's row count.
// NOTE(review): no lower bound is checked here; presumably a non-positive capacity is
// rejected elsewhere (e.g. in tiling) — confirm before adding a stricter check.
static ge::graphStatus GetAndCheckAttrExpertCapacity(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                     const gert::Shape *xShape, int64_t &expertCapacity,
                                                     int64_t &dropPadMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertCapacity.");
    const int64_t *expertCapacityPtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_CAPACITY);
    if (nullptr == expertCapacityPtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_capacity should not be none.");
        return ge::GRAPH_FAILED;
    }
    expertCapacity = *expertCapacityPtr;
    // Bug fix: the message previously said "between 0 and n" although the check only
    // enforces the upper bound; it now describes the actual constraint.
    if (dropPadMode == DropPadMode::DROP_PAD && xShape->GetDim(0) > 0 && expertCapacity > xShape->GetDim(0)) {
        OPS_LOG_E(context->GetNodeName(), "The expert_capacity should not be greater than the first dim of x (%ld). But it is %ld.",
                  xShape->GetDim(0), expertCapacity);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertCapacity.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads and validates the "expert_num" attribute.
// expert_num must be in (0, MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND].
static ge::graphStatus GetAndCheckAttrExpertNum(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                int64_t &experNum)
{
    // Bug fix: the begin log previously printed the garbled name "GetAndCheckexperNum".
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertNum.");
    const int64_t *experNumPtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_NUM);
    if (nullptr == experNumPtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_num should not be none.");
        return ge::GRAPH_FAILED;
    }
    experNum = *experNumPtr;
    // Bug fix: the message previously mentioned only the lower bound although the upper
    // bound is also enforced.
    if (experNum <= 0 || experNum > MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND) {
        OPS_LOG_E(context->GetNodeName(), "The expert_num should be in (0, %ld]. But it is %ld.",
                  MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND, experNum);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertNum.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "drop_pad_mode" attribute and verifies it lies within the DropPadMode range
// [NO_DROP_PAD, DROP_PAD]. Returns GRAPH_FAILED on a missing or out-of-range value.
static ge::graphStatus GetAndCheckAttrDropPadMode(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                  int64_t &dropPadMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrDropPadMode.");
    const int64_t *modePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_DROP_PAD_MODE);
    if (modePtr == nullptr) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for drop_pad_mode is none.");
        return ge::GRAPH_FAILED;
    }

    dropPadMode = *modePtr;
    const bool inRange = (dropPadMode >= DropPadMode::NO_DROP_PAD) && (dropPadMode <= DropPadMode::DROP_PAD);
    if (!inRange) {
        OPS_LOG_E(context->GetNodeName(), "The drop_pad_mode should be %d or %d. But it is %ld.", DropPadMode::NO_DROP_PAD,
                  DropPadMode::DROP_PAD, dropPadMode);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrDropPadMode.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "expert_token_num_type" attribute and verifies it is one of
// CUMSUM, COUNT or KEY_VALUE.
static ge::graphStatus GetAndCheckAttrExpertTokenNumType(const gert::RuntimeAttrs *attrs, gert::InferShapeContext* context,
                                                         int64_t &experTokenNumType)
{
    // Bug fix: the begin log previously printed the garbled name "GetAndCheckexperTokenNumType".
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertTokenNumType.");
    const int64_t *experTokenNumTypePtr =
        attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_TYPE);
    if (nullptr == experTokenNumTypePtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_token_num_type should not be none.");
        return ge::GRAPH_FAILED;
    }
    experTokenNumType = *experTokenNumTypePtr;
    if (experTokenNumType < ExpertTokenNumType::CUMSUM || experTokenNumType > ExpertTokenNumType::KEY_VALUE) {
        OPS_LOG_E(context->GetNodeName(), "The expert_token_num_type should be %d, %d or %d. But it is %ld.",
                  ExpertTokenNumType::CUMSUM, ExpertTokenNumType::COUNT, ExpertTokenNumType::KEY_VALUE,
                  experTokenNumType);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertTokenNumType.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "expert_token_num_flag" attribute (whether the cumsum/count output is produced).
// The flag itself has no range to validate; only presence is checked.
static ge::graphStatus GetAndCheckAttrExpertTokenNumFlag(const gert::RuntimeAttrs *attrs,
                                                         gert::InferShapeContext *context, bool &experTokenNumFlag)
{
    // Bug fix: both debug logs previously printed the sibling function's name
    // ("GetAndCheck...ExpertTokenNumType") due to copy-paste.
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertTokenNumFlag.");
    const bool *experTokenNumFlagPtr = attrs->GetAttrPointer<bool>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_FLAG);
    if (nullptr == experTokenNumFlagPtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_token_num_flag should not be none.");
        return ge::GRAPH_FAILED;
    }
    experTokenNumFlag = *experTokenNumFlagPtr;
    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertTokenNumFlag.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "quant_mode" attribute and verifies it is one of NON_QUANT, STATIC_QUANT
// or DYNAMIC_QUANT. Returns GRAPH_FAILED on missing attrs, missing value or bad range.
static ge::graphStatus GetAndCheckAttrQuantMode(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                int64_t &quantMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckQuantMode.");
    if (attrs == nullptr) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for quant_mode is none.");
        return ge::GRAPH_FAILED;
    }
    const int64_t *modeAttr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_QUANT_MODE);
    if (modeAttr == nullptr) {
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is none.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT);
        return ge::GRAPH_FAILED;
    }

    quantMode = *modeAttr;
    const bool valid = (quantMode >= QuantMode::NON_QUANT) && (quantMode <= QuantMode::DYNAMIC_QUANT);
    if (!valid) {
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is %ld.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT, quantMode);
        return ge::GRAPH_FAILED;
    }
    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckQuantMode.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "row_Idx_type" attribute (0 or 1) and, as a side effect, also re-reads
// "drop_pad_mode" into dropPadMode. In DROP_PAD mode only row_Idx_type == 0 is allowed.
static ge::graphStatus GetAndCheckAttrRowIdxType(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                 int64_t &rowIdxType, int64_t &dropPadMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrRowIdxType.");
    if (nullptr == attrs) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for row_Idx_type is none.");
        return ge::GRAPH_FAILED;
    }
    const int64_t *dropPadModePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_DROP_PAD_MODE);
    // Bug fix: dropPadModePtr was dereferenced without a null check, unlike every other
    // attribute read in this file.
    if (nullptr == dropPadModePtr) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for drop_pad_mode is none.");
        return ge::GRAPH_FAILED;
    }
    dropPadMode = *dropPadModePtr;

    const int64_t *rowIdxTypePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_ROW_IDX_TYPE);
    if (nullptr == rowIdxTypePtr) {
        OPS_LOG_E(context->GetNodeName(), "The row_Idx_type should be 0 or 1. But it is none.");
        return ge::GRAPH_FAILED;
    }
    rowIdxType = *rowIdxTypePtr;
    if (dropPadMode == DropPadMode::DROP_PAD && rowIdxType != 0) {
        OPS_LOG_E(context->GetNodeName(), "The row_Idx_type should be 0 when dropPadMode is equal to 1 But it is %ld.", rowIdxType);
        return ge::GRAPH_FAILED;
    }

    if (rowIdxType < 0 || rowIdxType > 1) {
        OPS_LOG_E(context->GetNodeName(), "The row_Idx_type should be 0 or 1 But it is %ld.", rowIdxType);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrRowIdxType.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Validates the shape of the optional scale input against quant_mode.
// - STATIC_QUANT: scale is required and must be 1-D with dim (-1), (-2) or (1,).
// - NON_QUANT: scale is optional; if present, 1-D with dim (-1)/(-2) (dynamic graph)
//   or x's first dim (static graph).
// - DYNAMIC_QUANT: scale is optional; if present, (-2) in 1-D form, or 2-D with first
//   dim 1 / active-expert-range and second dim equal to x's second dim.
static ge::graphStatus CheckInputScaleShape(gert::InferShapeContext *context, const gert::Shape *xShape,
                                            const gert::Shape *scaleShape, const int64_t expertStart,
                                            const int64_t expertEnd, const int64_t quantMode)
{
    // When quant_mode is STATIC_QUANT, scale cannot be none.
    OP_CHECK((nullptr == scaleShape && QuantMode::STATIC_QUANT == quantMode),
             OPS_LOG_E(context->GetNodeName(), "The scale cannot be none when quant_mode is %ld.", quantMode),
             return ge::GRAPH_FAILED);

    // When quant_mode is NON_QUANT or DYNAMIC_QUANT, scale can be none.
    OP_CHECK((nullptr == scaleShape && (QuantMode::NON_QUANT == quantMode || QuantMode::DYNAMIC_QUANT == quantMode)),
             OPS_LOG_I(context->GetNodeName(), "When quant_mode is NON_QUANT or DYNAMIC_QUANT, scale can be none."),
             return ge::GRAPH_SUCCESS);

    if (QuantMode::NON_QUANT == quantMode) {
        if (scaleShape->GetDimNum() == DIM_ONE) {
            OP_CHECK(scaleShape->GetDim(0) < 0 && scaleShape->GetDim(0) != NEG_ONE && scaleShape->GetDim(0) != NEG_TWO,
                     OPS_LOG_E(context->GetNodeName(),
                     "When quant_mode is %ld and use scale in dynamic graph, The shape of scale should be (-1) or (-2), current shape is (%s).",
                     quantMode, ops::Shape2String(*scaleShape).c_str()),
                     return ge::GRAPH_FAILED);
            OP_CHECK(scaleShape->GetDim(0) > 0 && !isSameDim(scaleShape->GetDim(0), xShape->GetDim(0)),
                     OPS_LOG_E(context->GetNodeName(),
                     "When quant_mode is %ld and use scale in static graph, The shape of scale should be (%ld,), current shape is (%s).",
                     quantMode, xShape->GetDim(0), ops::Shape2String(*scaleShape).c_str()),
                     return ge::GRAPH_FAILED);
        } else {
            OPS_LOG_E(context->GetNodeName(), "When quant_mode is %ld, The dimNum of scale should be 1, current shape is (%ld).", quantMode,
                      scaleShape->GetDimNum());
            return ge::GRAPH_FAILED;
        }
    } else if (QuantMode::STATIC_QUANT == quantMode) {
        if (scaleShape->GetDimNum() == DIM_ONE) {
            OP_CHECK(
                scaleShape->GetDim(0) != NEG_ONE && scaleShape->GetDim(0) != NEG_TWO &&
                !isSameDim(scaleShape->GetDim(0), DIM_ONE),
                OPS_LOG_E(
                    context->GetNodeName(),
                    "When quant_mode is %ld, the shape of scale should be (-1) or (-2) or (1,), current shape is (%s).",
                    quantMode, ops::Shape2String(*scaleShape).c_str()),
                return ge::GRAPH_FAILED);
        } else {
            OPS_LOG_E(context->GetNodeName(), "When quant_mode is %ld, the dimNum of scale should be (1,), current shape is (%ld).",
                      quantMode, scaleShape->GetDimNum());
            return ge::GRAPH_FAILED;
        }
    } else if (QuantMode::DYNAMIC_QUANT == quantMode) {
        int64_t activeExpertRange = expertEnd - expertStart;
        if (scaleShape->GetDimNum() == DIM_ONE) {
            OP_CHECK(scaleShape->GetDim(0) != NEG_TWO,
                     OPS_LOG_E(context->GetNodeName(),
                     "When quant_mode is %ld and scale dim is 1 in dynamic graph, the first dim of scale should be -2, but "
                     "its shape is (%ld).",
                     quantMode, scaleShape->GetDim(0)),
                     return ge::GRAPH_FAILED);
        } else if (scaleShape->GetDimNum() == DIM_TWO) {
            if (scaleShape->GetDim(0) > 0) {
                OP_CHECK(
                    !isSameDim(scaleShape->GetDim(0), activeExpertRange) && !isSameDim(scaleShape->GetDim(0), DIM_ONE),
                    OPS_LOG_E(
                        context->GetNodeName(),
                        "When quant_mode is %ld in static graph, the first dim of scale should be 1 or %ld, but its shape is (%ld).",
                        quantMode, activeExpertRange, scaleShape->GetDim(0)),
                    return ge::GRAPH_FAILED);
                // Bug fix: the message said "should or" and printed GetDim(0); it now reads
                // "should be" and prints the second dim, which is what is being checked.
                OP_CHECK(
                    !isSameDim(scaleShape->GetDim(1), xShape->GetDim(1)),
                    OPS_LOG_E(
                        context->GetNodeName(),
                        "When quant_mode is %ld in static graph, the second dim of scale should be %ld, but its shape is (%ld).",
                        quantMode, xShape->GetDim(1), scaleShape->GetDim(1)),
                    return ge::GRAPH_FAILED);
            } else {
                // Bug fix: %d -> %ld for the int64_t dim argument xShape->GetDim(1).
                OP_CHECK(
                    scaleShape->GetDim(0) != NEG_ONE || (scaleShape->GetDim(1) != NEG_ONE && scaleShape->GetDim(1) != xShape->GetDim(1)),
                    OPS_LOG_E(context->GetNodeName(),
                    "When quant_mode is %ld and scale dim is 2 in dynamic graph, the shape of scale should be (-1, -1) or (-1, %ld), but its shape is (%s).",
                    quantMode, xShape->GetDim(1), ops::Shape2String(*scaleShape).c_str()),
                    return ge::GRAPH_FAILED);
            }
        } else {
            // Bug fix: the format string has two %ld but only one argument was supplied,
            // so GetDimNum() was printed in the quant_mode slot and the second slot read
            // past the argument list (undefined behavior).
            OPS_LOG_E(
                context->GetNodeName(),
                "When quant_mode is %ld, the dimNum of scale should be 1(dynamic graph) or 2, but its shape is (%ld).",
                quantMode, scaleShape->GetDimNum());
            return ge::GRAPH_FAILED;
        }
    }
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Validates the shape of the optional offset input.
// offset is only required (and only checked) when quant_mode is STATIC_QUANT; it must
// then be 1-D with dim (1,) in static graphs or (-1)/(-2) in dynamic graphs.
// expertStart/expertEnd are currently unused; they are kept for signature parity with
// CheckInputScaleShape.
static ge::graphStatus CheckInputOffsetShape(gert::InferShapeContext *context,
                                             const gert::Shape *offsetShape, const int64_t expertStart,
                                             const int64_t expertEnd, const int64_t quantMode)
{
    // The shape of offset can be none unless quant_mode is STATIC_QUANT.
    if (quantMode != QuantMode::STATIC_QUANT) {
        return ge::GRAPH_SUCCESS;
    } else if (nullptr == offsetShape) {
        // Bug fix: this failure path previously returned without logging anything,
        // unlike every other validation in this file.
        OPS_LOG_E(context->GetNodeName(), "The offset cannot be none when quant_mode is %ld.", quantMode);
        return ge::GRAPH_FAILED;
    }

    if (offsetShape->GetDimNum() != DIM_ONE) {
        OPS_LOG_E(context->GetNodeName(), "The dimNum of offset should be 1, current shape is (%ld).", offsetShape->GetDimNum());
        return ge::GRAPH_FAILED;
    }
    if (offsetShape->GetDim(0) != NEG_ONE && offsetShape->GetDim(0) != NEG_TWO && !isSameDim(offsetShape->GetDim(0), DIM_ONE)) {
        OPS_LOG_E(context->GetNodeName(),
                  "The shape of offset should be (1,) in static graph or (-2), (-1,) in dynamic graph, current shape is (%s).",
                  ops::Shape2String(*offsetShape).c_str());
        return ge::GRAPH_FAILED;
    }

    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Validates the shapes of x and expert_idx (rank, dynamic markers, matching first dims)
// and delegates scale/offset shape validation to the dedicated helpers.
static ge::graphStatus CheckInputShape(gert::InferShapeContext *context, const gert::Shape *xShape,
                                       const gert::Shape *expertIdxShape, const gert::Shape *scaleShape,
                                       const gert::Shape *offsetShape, const int64_t expertStart,
                                       const int64_t expertEnd, const int64_t quantMode)
{
    // x must either be 2-D or use the dynamic-rank marker (-2,).
    const size_t xDimNum = xShape->GetDimNum();
    if (xDimNum == DIM_ONE) {
        if (xShape->GetDim(0) != ge::UNKNOWN_DIM_NUM) {
            OPS_LOG_E(context->GetNodeName(), "The dynamic dim of x should be -2, current shape is %s.",
                      ops::Shape2String(*xShape).c_str());
            return ge::GRAPH_FAILED;
        }
    } else if (xDimNum != DIM_TWO) {
        OPS_LOG_E(context->GetNodeName(), "The dim of x should be 2 or dynamic, current shape is %s.",
                  ops::Shape2String(*xShape).c_str());
        return ge::GRAPH_FAILED;
    }

    // In the dynamic-rank form both dims are treated as unknown (NEG_ONE).
    const int64_t xRowNum = (xDimNum == DIM_ONE) ? NEG_ONE : xShape->GetDim(0);
    const int64_t xColNum = (xDimNum == DIM_ONE) ? NEG_ONE : xShape->GetDim(1);
    if (xRowNum < NEG_ONE || xColNum < NEG_ONE) {
        OPS_LOG_E(context->GetNodeName(), "Invalid x shape, shape is %s.", ops::Shape2String(*xShape).c_str());
        return ge::GRAPH_FAILED;
    }

    // expert_idx follows the same rule: 2-D or the (-2,) dynamic-rank marker.
    const size_t idxDimNum = expertIdxShape->GetDimNum();
    if (idxDimNum == DIM_ONE) {
        if (expertIdxShape->GetDim(0) != ge::UNKNOWN_DIM_NUM) {
            OPS_LOG_E(context->GetNodeName(), "The dynamic dim of expert_idx should be -2, current shape is %s.",
                      ops::Shape2String(*expertIdxShape).c_str());
            return ge::GRAPH_FAILED;
        }
    } else if (idxDimNum != DIM_TWO) {
        OPS_LOG_E(context->GetNodeName(), "The dim of expert_idx should be 2 or dynamic, current shape is %s.",
                  ops::Shape2String(*expertIdxShape).c_str());
        return ge::GRAPH_FAILED;
    }

    const int64_t idxRowNum = (idxDimNum == DIM_ONE) ? NEG_ONE : expertIdxShape->GetDim(0);
    const int64_t idxColNum = (idxDimNum == DIM_ONE) ? NEG_ONE : expertIdxShape->GetDim(1);
    if (idxRowNum < NEG_ONE || idxColNum < NEG_ONE) {
        OPS_LOG_E(context->GetNodeName(), "Invalid expert_idx shape, shape is %s.",
                  ops::Shape2String(*expertIdxShape).c_str());
        return ge::GRAPH_FAILED;
    }

    // x and expert_idx describe the same batch of rows.
    if (!isSameDim(xRowNum, idxRowNum)) {
        OPS_LOG_E(context->GetNodeName(), "The first dim of x and expert_idx should be same.");
        return ge::GRAPH_FAILED;
    }

    // scale shape depends on quant_mode; delegated.
    if (CheckInputScaleShape(context, xShape, scaleShape, expertStart, expertEnd, quantMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // offset shape depends on quant_mode; delegated.
    if (CheckInputOffsetShape(context, offsetShape, expertStart, expertEnd, quantMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Debug helper: dumps the input shapes and the shape-relevant attributes before inference.
static void ShowInputShapeAndAttrInfo(gert::InferShapeContext *context, const gert::Shape *xShape,
                                      const gert::Shape *expertIdxShape, const gert::Shape *scaleShape,
                                      const gert::Shape *offsetShape, const int64_t expertStart,
                                      const int64_t expertEnd, const int64_t quantMode, const int64_t rowIdxType)
{
    // x and expert_idx are required inputs, so their shapes are always printable.
    OPS_LOG_D(context->GetNodeName(), "x shape is: %s.", ops::Shape2String(*xShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expert_idx shape is: %s.", ops::Shape2String(*expertIdxShape).c_str());

    // scale is optional; print a placeholder when it is absent.
    if (scaleShape != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "scale_shape is: %s.", ops::Shape2String(*scaleShape).c_str());
    } else {
        OPS_LOG_D(context->GetNodeName(), "scale_shape is: none.");
    }

    // offset is optional as well.
    OPS_LOG_D(context->GetNodeName(), "Begin print offset_shape.");
    if (offsetShape != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "offset_shape is: %s.", ops::Shape2String(*offsetShape).c_str());
    } else {
        OPS_LOG_D(context->GetNodeName(), "offset_shape is: none.");
    }
    OPS_LOG_D(context->GetNodeName(), "End print offset_shape.");

    // Attributes that drive the output-shape inference.
    OPS_LOG_D(context->GetNodeName(), "active_expert_range is: [%ld, %ld).", expertStart, expertEnd);
    OPS_LOG_D(context->GetNodeName(), "quant_mode is: %ld.", quantMode);
    OPS_LOG_D(context->GetNodeName(), "row_Idx_type is: %ld.", rowIdxType);
}
|
||||
|
||||
// Debug helper: prints the four inferred output shapes after InferShape has populated them.
// All four pointers must be non-null (the caller null-checks them via OP_CHECK_NULL_WITH_CONTEXT).
static void ShowOutputShapeInfo(gert::InferShapeContext *context, const gert::Shape *expandedXShape,
                                const gert::Shape *expandedRowIdxShape,
                                const gert::Shape *expertTokenCumsumOrCountShape, const gert::Shape *expandedScaleShape)
{
    OPS_LOG_D(context->GetNodeName(), "expanded_x shape is: %s after infershape.",
              ops::Shape2String(*expandedXShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expanded_row_idx shape is: %s after infershape.",
              ops::Shape2String(*expandedRowIdxShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expert_token_cumsum_or_count shape is: %s after infershape.",
              ops::Shape2String(*expertTokenCumsumOrCountShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expanded_scale shape is: %s after infershape.",
              ops::Shape2String(*expandedScaleShape).c_str());
}
|
||||
|
||||
// InferShape for MoeInitRoutingCustom.
// Validates the inputs (x, expert_idx, optional scale/offset) and all attributes, then
// derives the shapes of expanded_x, expanded_row_idx, expert_token_cumsum_or_count and
// expanded_scale. NEG_ONE marks a dynamic (unknown) dim throughout.
static ge::graphStatus InferShape4MoeInitRoutingCustom(gert::InferShapeContext *context)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do MoeInitRoutingCustomInfershape.");
    // 1. Get and check input shapes.
    // 1.1 input_x (required)
    const gert::Shape *xShape = context->GetInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_X);
    OP_CHECK_NULL_WITH_CONTEXT(context, xShape);

    // 1.2 expert_idx (required)
    const gert::Shape *expertIdxShape = context->GetInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_EXPERT_IDX);
    OP_CHECK_NULL_WITH_CONTEXT(context, expertIdxShape);

    // 1.3 scale is optional and can be none; no null check.
    const gert::Shape *scaleShape = context->GetOptionalInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_SCALE);

    // 1.4 offset is optional and can be none; no null check.
    const gert::Shape *offsetShape = context->GetOptionalInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_OFFSET);

    // 2. Get and check attrs. Null-checked once here; the redundant second
    // "if (nullptr == attrs)" that used to sit between steps 2.2 and 2.3 was removed.
    const gert::RuntimeAttrs *attrs = context->GetAttrs();
    OP_CHECK_NULL_WITH_CONTEXT(context, attrs);

    // 2.1 expert_num
    int64_t experNum = static_cast<int64_t>(-1);
    if (GetAndCheckAttrExpertNum(attrs, context, experNum) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.2 active_expert_range
    int64_t expertStart = static_cast<int64_t>(-1);
    int64_t expertEnd = static_cast<int64_t>(-1);
    if (GetAndCheckAttrActiveExpertRange(attrs, context, expertStart, expertEnd, experNum) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.3 drop_pad_mode
    int64_t dropPadMode = static_cast<int64_t>(-1);
    if (GetAndCheckAttrDropPadMode(attrs, context, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.4 active_num (range-checked only in NO_DROP_PAD mode)
    int64_t activeNum = static_cast<int64_t>(-1);
    if (GetAndCheckAttrActiveNum(attrs, context, activeNum, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.5 expert_capacity (range-checked only in DROP_PAD mode)
    int64_t expertCapacity = static_cast<int64_t>(-1);
    if (GetAndCheckAttrExpertCapacity(attrs, context, xShape, expertCapacity, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.6 expert_token_num_type
    int64_t expertTokenNumType = static_cast<int64_t>(-1);
    if (GetAndCheckAttrExpertTokenNumType(attrs, context, expertTokenNumType) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.7 expert_token_num_flag (comment fixed: it previously said "_type")
    bool expertTokenNumFlag = false;
    if (GetAndCheckAttrExpertTokenNumFlag(attrs, context, expertTokenNumFlag) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.8 quant_mode
    int64_t quantMode = static_cast<int64_t>(-1);
    if (GetAndCheckAttrQuantMode(attrs, context, quantMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.9 row_Idx_type
    int64_t rowIdxType = static_cast<int64_t>(-1);
    if (GetAndCheckAttrRowIdxType(attrs, context, rowIdxType, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // Cross-check the input shapes against each other and the attrs.
    if (CheckInputShape(context, xShape, expertIdxShape, scaleShape, offsetShape, expertStart, expertEnd, quantMode) !=
        ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 3. Infer output shapes.
    // 3.1 Fetch the output shape objects.
    gert::Shape *expandedXShape = context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X);
    OP_CHECK_NULL_WITH_CONTEXT(context, expandedXShape);
    gert::Shape *expandedRowIdxShape = context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX);
    OP_CHECK_NULL_WITH_CONTEXT(context, expandedRowIdxShape);
    gert::Shape *expertTokenCumsumOrCountShape =
        context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT);
    OP_CHECK_NULL_WITH_CONTEXT(context, expertTokenCumsumOrCountShape);
    gert::Shape *expandedScaleShape = context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE);
    OP_CHECK_NULL_WITH_CONTEXT(context, expandedScaleShape);

    // A 1-D input shape is the dynamic-rank marker (-2,), so every dim is unknown.
    int64_t x_n = xShape->GetDimNum() == DIM_ONE ? NEG_ONE : xShape->GetDim(0);
    int64_t cols = xShape->GetDimNum() == DIM_ONE ? NEG_ONE : xShape->GetDim(1);

    int64_t expert_idx_n = expertIdxShape->GetDimNum() == DIM_ONE ? NEG_ONE : expertIdxShape->GetDim(0);
    int64_t k = expertIdxShape->GetDimNum() == DIM_ONE ? NEG_ONE : expertIdxShape->GetDim(1);
    int64_t n = x_n > expert_idx_n ? x_n : expert_idx_n;
    // active_num of 0 or -1 means "keep all n*k routed rows".
    if (activeNum == 0 || activeNum == -1) {
        activeNum = n * k;
    } else {
        activeNum = std::min(activeNum, n * k);
    }

    int64_t xOutDimNum = activeNum < n * k ? activeNum : n * k;
    int64_t outNum = (n == NEG_ONE || k == NEG_ONE) ? NEG_ONE : n * k;
    int64_t xOutNum = (n == NEG_ONE || k == NEG_ONE) ? NEG_ONE : xOutDimNum;
    // 3.2 expanded_x: (active rows, cols) without drop/pad,
    // (expert_num, expert_capacity, cols) in drop/pad mode.
    if (dropPadMode == DropPadMode::NO_DROP_PAD) {
        expandedXShape->SetDimNum(DIM_TWO);
        expandedXShape->SetDim(0U, xOutNum);
        expandedXShape->SetDim(DIM_ONE, cols);
    } else {
        expandedXShape->SetDimNum(DIM_THREE);
        expandedXShape->SetDim(0U, experNum);
        expandedXShape->SetDim(DIM_ONE, expertCapacity);
        expandedXShape->SetDim(DIM_TWO, cols);
    }

    // 3.3 expanded_row_idx: one entry per routed row, (n*k,).
    expandedRowIdxShape->SetDimNum(DIM_ONE);
    expandedRowIdxShape->SetDim(0U, outNum);

    // 3.4 expert_token_cumsum_or_count: (expert_num, 2) in KEY_VALUE mode, otherwise the
    // size of the active expert range. Left untouched when expert_token_num_flag is false.
    if (expertTokenNumFlag) {
        if (expertTokenNumType == ExpertTokenNumType::KEY_VALUE) {
            expertTokenCumsumOrCountShape->SetDimNum(DIM_TWO);
            expertTokenCumsumOrCountShape->SetDim(0U, experNum);
            expertTokenCumsumOrCountShape->SetDim(DIM_ONE, KEY_VALUE_MODE_DIM0_NUM);
        } else {
            expertTokenCumsumOrCountShape->SetDimNum(DIM_ONE);
            expertTokenCumsumOrCountShape->SetDim(0U, expertEnd - expertStart);
        }
    }

    // 3.5 expanded_scale: only populated in NON_QUANT and DYNAMIC_QUANT modes.
    if (QuantMode::NON_QUANT == quantMode || QuantMode::DYNAMIC_QUANT == quantMode) {
        expandedScaleShape->SetDimNum(DIM_ONE);
        if (dropPadMode == DropPadMode::NO_DROP_PAD) {
            expandedScaleShape->SetDim(0U, xOutNum);
        } else {
            expandedScaleShape->SetDim(0U, experNum * expertCapacity);
        }
    }

    ShowOutputShapeInfo(context, expandedXShape, expandedRowIdxShape, expertTokenCumsumOrCountShape,
                        expandedScaleShape);
    OPS_LOG_D(context->GetNodeName(), "End to do MoeInitRoutingCustomInfershape.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// InferDataType for MoeInitRoutingCustom.
// expanded_x keeps x's dtype in NON_QUANT mode and becomes int8 in either quant mode;
// the other outputs have fixed dtypes (int32 / int64 / float).
static ge::graphStatus InferDataType4MoeInitRoutingCustom(gert::InferDataTypeContext *context)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do MoeInitRoutingCustomInferDataType.");

    // Get and check quant_mode attr.
    const gert::RuntimeAttrs *attrs = context->GetAttrs();
    OP_CHECK_NULL_WITH_CONTEXT(context, attrs);
    const int64_t *quantModePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_QUANT_MODE);
    if (nullptr == quantModePtr) {
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is none.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT);
        return ge::GRAPH_FAILED;
    }
    const int64_t quantMode = *quantModePtr;

    // Infer the expanded_x dtype according to quant_mode.
    auto xDtype = context->GetInputDataType(MOE_INIT_ROUTING_CUSTOM_INPUT_X);
    if (QuantMode::NON_QUANT == quantMode) {
        context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X, xDtype);
    } else if (QuantMode::STATIC_QUANT == quantMode || QuantMode::DYNAMIC_QUANT == quantMode) {
        if (ge::DT_INT8 == xDtype) {
            // Fixed message typo: "int_8" -> "int8".
            OPS_LOG_E(context->GetNodeName(), "When quant_mode=%ld, xDtype cannot be int8.", quantMode);
            return ge::GRAPH_FAILED;
        }
        context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X, ge::DT_INT8);
    } else {
        // Bug fix: an out-of-range quant_mode previously fell through silently, leaving
        // the expanded_x dtype unset. Mirror the validation done in InferShape.
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is %ld.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT, quantMode);
        return ge::GRAPH_FAILED;
    }
    context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX, ge::DT_INT32);
    context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT, ge::DT_INT64);
    context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE, ge::DT_FLOAT);
    OPS_LOG_D(context->GetNodeName(), "End to do MoeInitRoutingCustomInferDataType.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// InferShapeRange for MoeInitRoutingCustom.
// Every output is reported as rank-fixed but fully dynamic: min dim 0, max dim -1
// (unbounded). expanded_x is 2-D; the remaining three outputs are 1-D.
// Bug fix: the original dereferenced range->GetMin()/GetMax() in the "Before" debug logs
// without the null checks that guarded the dim-setting code below; all logging now lives
// inside the same null-guarded section as the dim updates.
static ge::graphStatus InferShapeRange4MoeInitRoutingCustom(gert::InferShapeRangeContext *context)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do MoeInitRoutingCustomInferRange.");

    // Get and check the pointers of all the outputs' shape range objects.
    auto expanded_x = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X);
    OP_CHECK_NULL_WITH_CONTEXT(context, expanded_x);
    auto expanded_row_idx = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX);
    OP_CHECK_NULL_WITH_CONTEXT(context, expanded_row_idx);
    auto count = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT);
    OP_CHECK_NULL_WITH_CONTEXT(context, count);
    auto expanded_scale = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE);
    OP_CHECK_NULL_WITH_CONTEXT(context, expanded_scale);

    // expanded_x: 2-D, each dim in [0, unbounded).
    if (expanded_x->GetMin() != nullptr && expanded_x->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_x->GetMin() = %s",
                  ops::Shape2String(*(expanded_x->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_x->GetMax() = %s",
                  ops::Shape2String(*(expanded_x->GetMax())).c_str());
        expanded_x->GetMin()->SetDimNum(DIM_TWO);
        expanded_x->GetMax()->SetDimNum(DIM_TWO);
        for (size_t i = 0; i < DIM_TWO; i++) {
            expanded_x->GetMin()->SetDim(i, 0);
            expanded_x->GetMax()->SetDim(i, -1);
        }
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_x->GetMin() = %s",
                  ops::Shape2String(*(expanded_x->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_x->GetMax() = %s",
                  ops::Shape2String(*(expanded_x->GetMax())).c_str());
    }

    // expanded_row_idx: 1-D, [0, unbounded).
    if (expanded_row_idx->GetMin() != nullptr && expanded_row_idx->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_row_idx->GetMin() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_row_idx->GetMax() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMax())).c_str());
        expanded_row_idx->GetMin()->SetDimNum(DIM_ONE);
        expanded_row_idx->GetMax()->SetDimNum(DIM_ONE);
        expanded_row_idx->GetMin()->SetDim(0, 0);
        expanded_row_idx->GetMax()->SetDim(0, -1);
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_row_idx->GetMin() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_row_idx->GetMax() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMax())).c_str());
    }

    // expert_token_cumsum_or_count: 1-D, [0, unbounded).
    if (count->GetMin() != nullptr && count->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, count->GetMin() = %s",
                  ops::Shape2String(*(count->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, count->GetMax() = %s",
                  ops::Shape2String(*(count->GetMax())).c_str());
        count->GetMin()->SetDimNum(DIM_ONE);
        count->GetMax()->SetDimNum(DIM_ONE);
        count->GetMin()->SetDim(0, 0);
        count->GetMax()->SetDim(0, -1);
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, count->GetMin() = %s",
                  ops::Shape2String(*(count->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, count->GetMax() = %s",
                  ops::Shape2String(*(count->GetMax())).c_str());
    }

    // expanded_scale: 1-D, [0, unbounded).
    if (expanded_scale->GetMin() != nullptr && expanded_scale->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_scale->GetMin() = %s",
                  ops::Shape2String(*(expanded_scale->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_scale->GetMax() = %s",
                  ops::Shape2String(*(expanded_scale->GetMax())).c_str());
        expanded_scale->GetMin()->SetDimNum(DIM_ONE);
        expanded_scale->GetMax()->SetDimNum(DIM_ONE);
        expanded_scale->GetMin()->SetDim(0, 0);
        expanded_scale->GetMax()->SetDim(0, -1);
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_scale->GetMin() = %s",
                  ops::Shape2String(*(expanded_scale->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_scale->GetMax() = %s",
                  ops::Shape2String(*(expanded_scale->GetMax())).c_str());
    }

    OPS_LOG_D(context->GetNodeName(), "End to do MoeInitRoutingCustomInferRange.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Register the shape/dtype/shape-range inference callbacks for the
// MoeInitRoutingCustom op so the GE runtime can derive output tensor
// shapes, data types and shape ranges at graph-compile time.
IMPL_OP_INFERSHAPE(MoeInitRoutingCustom)
    .InferShape(InferShape4MoeInitRoutingCustom)
    .InferDataType(InferDataType4MoeInitRoutingCustom)
    .InferShapeRange(InferShapeRange4MoeInitRoutingCustom);
}  // namespace ops
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,143 @@
|
||||
/**
 * This program is free software, you can redistribute it and/or modify.
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_init_routing_custom_tiling.h
 * \brief Tiling-data structures and compile-time info for the MoeInitRoutingCustom op.
 *        Each sub-stage of the kernel (sort, merge, token counting, gather, scatter)
 *        has its own per-core work-split descriptor, aggregated in
 *        MoeInitRoutingCustomTilingData.
 */
#ifndef AIR_CXX_RUNTIME_V2_OP_IMPL_MOE_INIT_ROUTING_CUSTOM_H
#define AIR_CXX_RUNTIME_V2_OP_IMPL_MOE_INIT_ROUTING_CUSTOM_H
#include "register/tilingdata_base.h"
#include "tiling/tiling_api.h"

namespace optiling {
// Per-core split for the VBS (vector block sort) stage: element/loop counts
// for the regular cores and the (possibly smaller) last core.
BEGIN_TILING_DATA_DEF(MoeCustomVBSComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, oneLoopMaxElements);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomVBSComputeTilingDataOp, MoeCustomVBSComputeTilingData)

// VMS (merge) middle stage only needs the number of participating cores.
BEGIN_TILING_DATA_DEF(MoeCustomVMSMiddleComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomVMSMiddleComputeTilingDataOp, MoeCustomVMSMiddleComputeTilingData)

// Final sort-output stage: maximum elements processed per loop iteration.
BEGIN_TILING_DATA_DEF(MoeCustomSortOutComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, oneLoopMaxElements);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomSortOutComputeTilingDataOp, MoeCustomSortOutComputeTilingData)

// Per-core split for the expert-token counting (histogram) stage.
BEGIN_TILING_DATA_DEF(MoeCustomExpertTokensCountTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopElements);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomExpertTokensCountTilingDataOp, MoeCustomExpertTokensCountTilingData)

// Per-core split for the gather-output stage; rows are split across cores and
// each row's columns are additionally tiled (colsLoops / perLoopCols / lastLoopCols).
BEGIN_TILING_DATA_DEF(MoeCustomGatherOutComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreIndicesLoops);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreIndicesLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, colsLoops);
TILING_DATA_FIELD_DEF(int64_t, perLoopCols);
TILING_DATA_FIELD_DEF(int64_t, lastLoopCols);
TILING_DATA_FIELD_DEF(int64_t, activeNum);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomGatherOutComputeTilingDataOp, MoeCustomGatherOutComputeTilingData)

// Per-core split for the src-to-dst scatter stage in drop/pad (capacity) mode.
BEGIN_TILING_DATA_DEF(MoeCustomSrcToDstCapacityComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreRows);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopRows);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopRows);
TILING_DATA_FIELD_DEF(int64_t, lastCoreRows);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopRows);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopRows);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, perLoopCols);
TILING_DATA_FIELD_DEF(int64_t, lastLoopCols);
TILING_DATA_FIELD_DEF(int64_t, colLoops);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomSrcToDstCapacityComputeTilingDataOp, MoeCustomSrcToDstCapacityComputeTilingData)

// Per-core split for the src-to-dst index-mapping stage (dropless mode).
BEGIN_TILING_DATA_DEF(MoeCustomSrcToDstComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreElements);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreElements);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);  // fixed: trailing ';' was missing, unlike every other field
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomSrcToDstComputeTilingDataOp, MoeCustomSrcToDstComputeTilingData)

// Top-level tiling data: op-wide scalars (shapes, expert range, modes) plus one
// embedded per-stage descriptor for each kernel phase.
BEGIN_TILING_DATA_DEF(MoeInitRoutingCustomTilingData)
TILING_DATA_FIELD_DEF(int64_t, coreNum);
TILING_DATA_FIELD_DEF(int64_t, n);
TILING_DATA_FIELD_DEF(int64_t, cols);
TILING_DATA_FIELD_DEF(int64_t, k);
TILING_DATA_FIELD_DEF(int64_t, expertStart);
TILING_DATA_FIELD_DEF(int64_t, expertEnd);
TILING_DATA_FIELD_DEF(int64_t, actualExpertNum);
TILING_DATA_FIELD_DEF(int64_t, quantMode);
TILING_DATA_FIELD_DEF(int64_t, rowIdxType);
TILING_DATA_FIELD_DEF(int64_t, isInputScale);
TILING_DATA_FIELD_DEF(int64_t, isInputOffset);
TILING_DATA_FIELD_DEF(int64_t, expertNum);
TILING_DATA_FIELD_DEF(int64_t, expertTokensNumType);
TILING_DATA_FIELD_DEF(int64_t, expertTokensNumFlag);
TILING_DATA_FIELD_DEF(int64_t, gatherFirstFullload);
TILING_DATA_FIELD_DEF(int64_t, ep);
TILING_DATA_FIELD_DEF(int64_t, activeNum);
TILING_DATA_FIELD_DEF(int64_t, dropPadMode);
TILING_DATA_FIELD_DEF(int64_t, smoothType);
TILING_DATA_FIELD_DEF(int64_t, expertCountElements);
TILING_DATA_FIELD_DEF(int64_t, expertCapacity);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomVBSComputeTilingData, vbsComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomVMSMiddleComputeTilingData, vmsMiddleComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSortOutComputeTilingData, sortOutComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomExpertTokensCountTilingData, expertTokensCountTilingDataOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomGatherOutComputeTilingData, gatherOutComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSrcToDstCapacityComputeTilingData, srcToDstDropPadParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSrcToDstCapacityComputeTilingData, srcToDstDropPadDynamicParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSrcToDstComputeTilingData, srcToDstComputeParamsOp);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeInitRoutingCustom, MoeInitRoutingCustomTilingData)

// Platform facts cached at compile (TilingParse) time and reused for every tiling call.
struct MoeInitRoutingCustomCompileInfo {
    int32_t aivNum = 0;    // number of AI-vector cores
    uint64_t ubSize = 0;   // unified-buffer size in bytes
    platform_ascendc::SocVersion socVersion = platform_ascendc::SocVersion::ASCEND910B;
};
}  // namespace optiling
#endif
|
||||
@@ -0,0 +1,68 @@
|
||||
/**
 * This program is free software, you can redistribute it and/or modify.
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_init_routing_custom_tiling_base.cpp
 * \brief Tiling entry points for MoeInitRoutingCustom: compile-time platform-info
 *        caching (TilingParse) and runtime dispatch to the registered tiling template.
 */
#include "moe_init_routing_custom_tiling.h"
#include "register/op_def_registry.h"
#include "tiling/tiling_templates_registry.h"

// Branch-prediction hint: the wrapped condition is expected to be false.
#define unlikely(x) __builtin_expect((x), 0)

// Log an error and fail tiling with GRAPH_FAILED when `ptr` is nullptr.
// Falls back to "nil" as the node name when the context itself is unusable.
#define OP_CHECK_NULL_WITH_CONTEXT(context, ptr)                                                           \
    do {                                                                                                   \
        if (unlikely((ptr) == nullptr)) {                                                                  \
            const char* name = (unlikely(((context) == nullptr) || (context)->GetNodeName() == nullptr)) ? \
                "nil" :                                                                                    \
                (context)->GetNodeName();                                                                  \
            OPS_LOG_E(name, "%s is nullptr!", #ptr);                                                       \
            return ge::GRAPH_FAILED;                                                                       \
        }                                                                                                  \
    } while (0)

namespace optiling {
// Runtime tiling entry: delegate to whichever tiling template is registered
// for this op in the global tiling registry.
static ge::graphStatus TilingForMoeInitRoutingCustom(gert::TilingContext *context)
{
    return TilingRegistry::GetInstance().DoTilingImpl(context);
}

// Compile-time preparation: cache AIV core count, UB size and SoC version into
// the op's compile info so per-invocation tiling does not re-query the platform.
// (Renamed from "Rounting" to fix the typo; the function is file-local.)
static ge::graphStatus TilingPrepareForMoeInitRoutingCustom(gert::TilingParseContext* context)
{
    OPS_LOG_D(context, "TilingPrepareForMoeInitRoutingCustom enter.");

    auto compileInfo = context->GetCompiledInfo<MoeInitRoutingCustomCompileInfo>();
    OP_CHECK_NULL_WITH_CONTEXT(context, compileInfo);
    auto platformInfo = context->GetPlatformInfo();
    OP_CHECK_NULL_WITH_CONTEXT(context, platformInfo);
    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
    compileInfo->aivNum = ascendcPlatform.GetCoreNumAiv();
    if (compileInfo->aivNum <= 0) {
        OPS_LOG_E(context, "TilingPrepareForMoeInitRoutingCustom fail to get core num.");
        return ge::GRAPH_FAILED;
    }

    // Initialize before the query: if GetCoreMemSize does not set the output,
    // the zero value is caught by the check below instead of reading garbage.
    uint64_t ubSize = 0;
    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize);
    compileInfo->ubSize = ubSize;  // field is uint64_t; the old int64_t cast was a no-op round-trip
    compileInfo->socVersion = ascendcPlatform.GetSocVersion();
    if (compileInfo->ubSize == 0) {  // unsigned, so "<= 0" could only ever mean "== 0"
        OPS_LOG_E(context, "TilingPrepareForMoeInitRoutingCustom fail to get ub size.");
        return ge::GRAPH_FAILED;
    }

    return ge::GRAPH_SUCCESS;
}

IMPL_OP_OPTILING(MoeInitRoutingCustom)
    .Tiling(TilingForMoeInitRoutingCustom)
    .TilingParse<MoeInitRoutingCustomCompileInfo>(TilingPrepareForMoeInitRoutingCustom);
}  // namespace optiling
|
||||
110
csrc/moe_init_routing_custom/op_kernel/moe_custom_common.h
Normal file
110
csrc/moe_init_routing_custom/op_kernel/moe_custom_common.h
Normal file
@@ -0,0 +1,110 @@
|
||||
/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_custom_common.h
 * \brief Shared constants and small device-side helpers for the
 *        MoeInitRoutingCustom kernel files.
 */
#ifndef MOE_CUSTOM_COMMON_H
#define MOE_CUSTOM_COMMON_H

#include "kernel_operator.h"

namespace MoeInitRoutingCustom {
using namespace AscendC;
// Split-axis selectors.
constexpr int64_t SPLIT_N = 0;
constexpr int64_t SPLIT_K = 1;
// Sentinel: approximately -FLT_MAX, used as a minimum fp32 value.
constexpr float MIN_FP32 = -3.4e38f;
// Vector-instruction granularities (elements per repeat / per 32-byte block).
constexpr int64_t FP32_ONE_REPEAT_NUM = 64;
constexpr int64_t ONE_REPEAT_SORT_NUM = 32;
constexpr int64_t ONE_REPEAT_COMPARE_NUM = 64;
constexpr int64_t BLOCK_BYTES = 32;
constexpr int64_t INT32_ONE_BLOCK_NUM = 8;
constexpr int64_t FP32_ONE_BLOCK_NUM = 8;
// drop_pad_mode attribute values.
constexpr int64_t DROPLESS_MODE = 0;
constexpr int64_t DROP_PAD_MODE = 1;
// Size of the `assist` index table below.
constexpr int64_t ASSIST_NUM = 256;
constexpr int64_t ASSIST_INDEX_NUM = 32;
// Maximum elements per list accepted by MrgSort.
constexpr int64_t MRGSORT_LIST_MAX_ELEMENT = 2040;
constexpr float MAX_INT8 = 127.0f;
// NOTE(review): 0xFF7FFFFF is the bit pattern of -FLT_MAX, not -inf;
// the name "INF" may be misleading — confirm intended use at call sites.
constexpr uint32_t INF = 0xFF7FFFFF;

// Merge-sort list counts and list indices.
constexpr int64_t MERGE_LIST_TWO = 2;
constexpr int64_t MERGE_LIST_THREE = 3;
constexpr int64_t MERGE_LIST_FOUR = 4;

constexpr int64_t MERGE_LIST_IDX_TWO = 2;
constexpr int64_t MERGE_LIST_IDX_THREE = 3;

// row_idx_type attribute values.
constexpr int64_t GATHER = 0;
constexpr int64_t SCATTER = 1;

// Smooth-scale variants (none / 1xH / ExH).
static constexpr int64_t NO_SCALE = 0;
static constexpr int64_t SCALE_1H = 1;
static constexpr int64_t SCALE_EH = 2;

// expert_tokens_num_type attribute values ("EXERPT" typo kept: the
// identifiers are referenced by other kernel files).
constexpr int64_t EXERPT_TOKENS_CUMSUM = 0;
constexpr int64_t EXERPT_TOKENS_COUNT = 1;
constexpr int64_t EXERPT_TOKENS_KEY_VALUE = 2;
constexpr int64_t EXERPT_TOKENS_NONE = 0;

// Index table in GM: value k at every 8th int32 slot (k = 0..31), zeros
// elsewhere — presumably consumed as an ascending-index seed; confirm at use.
const __gm__ int32_t assist[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,
    4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
    8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0,
    12, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0,
    16, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0,
    20, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0,
    24, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0, 0, 0, 26, 0, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0,
    28, 0, 0, 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0};

// Ceiling division; returns 0 when b == 0 (guards a divide-by-zero).
__aicore__ inline int64_t Ceil(int64_t a, int64_t b)
{
    if (b == 0) {
        return 0;
    }
    return (a + b - 1) / b;
}

// Round `elementNum` elements of size `bytes` up to a whole number of
// 32-byte blocks, returned as an element count. Returns 0 when bytes == 0.
__aicore__ inline int64_t Align(int64_t elementNum, int64_t bytes)
{
    if (bytes == 0) {
        return 0;
    }
    return (elementNum * bytes + BLOCK_BYTES - 1) / BLOCK_BYTES * BLOCK_BYTES / bytes;
}

// Round `elementNum` elements of size `bytes` up to a whole number of
// 32-byte blocks, returned as a byte count.
__aicore__ inline int64_t AlignBytes(int64_t elementNum, int64_t bytes)
{
    return (elementNum * bytes + BLOCK_BYTES - 1) / BLOCK_BYTES * BLOCK_BYTES;
}

// Minimum of two comparable values.
template <typename T>
__aicore__ inline T Min(T a, T b)
{
    return a > b ? b : a;
}

// Maximum of two comparable values.
template <typename T>
__aicore__ inline T Max(T a, T b)
{
    return a < b ? b : a;
}

// Set-then-wait on a hardware event to order two pipeline stages.
// NOTE(review): the event is passed both as the template argument and as the
// runtime `evt` (used only for FetchEventID); all call sites pass the same
// value twice — a mismatch would desynchronize the flag; confirm intent.
template <HardEvent event>
__aicore__ inline void SetWaitFlag(HardEvent evt)
{
    event_t eventId = static_cast<event_t>(GetTPipePtr()->FetchEventID(evt));
    SetFlag<event>(eventId);
    WaitFlag<event>(eventId);
}

}  // namespace MoeInitRoutingCustom
#endif  // MOE_CUSTOM_COMMON_H
|
||||
@@ -0,0 +1,371 @@
|
||||
/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_custom_expert_tokens_count.h
 * \brief Kernel stage that counts tokens routed to each expert (histogram /
 *        cumsum / key-value output modes, selected by HISTOGRAMTYPE).
 */
#ifndef MOE_CUSTOM_EXPERT_TOKENS_COUNT_H
#define MOE_CUSTOM_EXPERT_TOKENS_COUNT_H

#include "moe_custom_common.h"
#include "kernel_operator.h"

namespace MoeInitRoutingCustom {
using namespace AscendC;

// One (expertId, count) pair per core in drop/pad mode.
constexpr int64_t EXPERT_ID_VALUE_NUM = 2;
// HISTOGRAMTYPE values: running sum, plain count, or (id, count) pairs.
constexpr int64_t CUMSUM_MODE = 0;
constexpr int64_t COUNT_MODE = 1;
constexpr int64_t KEY_VALUE_MODE = 2;
constexpr int64_t KEY_VALUE_MODE_DIM_NUM = 2;
constexpr int64_t GATHER_SORT_CORE_NUM = 16;
// drop_pad_mode values (mirror DROPLESS_MODE / DROP_PAD_MODE in common.h).
constexpr int64_t DROP_LESS = 0;
constexpr int64_t DROP_PAD = 1;

// Counts, per expert in [expertStart, expertEnd), how many tokens were routed
// to it, reading the sorted expert-id list from workspace and atomically
// accumulating per-core partial results into a workspace histogram.
template <const int HISTOGRAMTYPE>
class ExpertTokensCount {
public:
    __aicore__ inline ExpertTokensCount(){};
    // Bind GM buffers and derive this core's slice of the sorted id list.
    // CALC_ACTUAL_EXPERT_NUM: re-derive the core split from the actual sorted
    // element count written to workspace by the sort stage.
    template <bool CALC_ACTUAL_EXPERT_NUM>
    __aicore__ inline void Init(GM_ADDR expandedRowIdx, GM_ADDR expertTokensCount, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    // Run the per-loop copy-in / histogram / copy-out pipeline.
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn(int64_t loop, int64_t curLoopElements);
    __aicore__ inline void Compute(int64_t curLoopElements);
    __aicore__ inline void CopyOut();
    __aicore__ inline void CopyOutExpertTotalCount();

    // Final pass (core 0 only): workspace histogram -> int64 output tensor.
    __aicore__ inline void expertCountCopyIn();
    __aicore__ inline void expertCountCompute();
    __aicore__ inline void expertCountCopyOut();

private:
    GlobalTensor<int32_t> sortedexpertIdxGm_;    // sorted expert ids (workspace)
    GlobalTensor<int32_t> expertCountTempGm_;    // int32 histogram accumulator (workspace)
    GlobalTensor<int64_t> expertTokensCountGm_;  // op output: per-expert token counts
    GlobalTensor<int32_t> expertTotalCountGm_;   // scalar total token count (workspace)
    GlobalTensor<int32_t> expandedRowIdxGm_;     // op output: row-index mapping
    GlobalTensor<int32_t> expertIdxValueGm_;     // per-core trailing (id, count) pairs (workspace)
    TPipe *pipe_;

    TQue<QuePosition::VECIN, 1> sortedExpertIdxInQueue_;
    TQue<QuePosition::VECOUT, 1> expertCountOutToTempQueue_;
    TQue<QuePosition::VECIN, 1> expertCountTempInQueue_;
    TQue<QuePosition::VECOUT, 1> expertIdxCountOutQueue_;
    TQue<QuePosition::VECOUT, 1> expertTotalCountQueue_;

    const MoeCustomExpertTokensCountTilingData *expertTokensCountTilingData_;
    int64_t coreNum_;
    int64_t blockIdx_;
    int64_t needCoreNum_;
    int64_t perCoreElements_;
    int64_t curCoreElements_ = 0;
    int64_t expertStart_ = 0;       // first expert id handled by this rank
    int64_t expertEnd_ = 0;         // one past the last expert id handled
    int64_t actualExpertNum_ = 0;
    int64_t coreLoopsNum_ = 0;
    int64_t perCorePerLoopElements_ = 0;
    int64_t perCoreLastLoopElements_ = 0;
    int64_t actualExpertTotalNum_ = 0;  // tokens this core counted in-range
    int64_t expertNum_ = 0;
    int64_t expertCountElements_ = 0;   // element count of the output tensor
    bool expertTokensNumFlag_ = false;  // whether the counts output is requested
    int64_t dropPadMode_ = 0;
    int32_t finalExpertId = -1;     // last in-range expert seen by Compute()
    int32_t expertTokenValue = 0;   // its (possibly partial) trailing count
    int64_t ep_ = 0;
    int64_t rowIdxType_ = 0;        // GATHER or SCATTER
};
|
||||
|
||||
// Bind all GM buffers, pick this core's slice of the sorted expert-id list,
// and allocate the UB queues. Must be called by every core; ends with SyncAll.
template <const int HISTOGRAMTYPE>
template <bool CALC_ACTUAL_EXPERT_NUM>
__aicore__ inline void
ExpertTokensCount<HISTOGRAMTYPE>::Init(GM_ADDR expandedRowIdx, GM_ADDR expertTokensCount, GM_ADDR workspace,
                                       const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Cache tiling scalars.
    coreNum_ = tilingData->coreNum;
    pipe_ = tPipe;
    expertTokensCountTilingData_ = &(tilingData->expertTokensCountTilingDataOp);
    blockIdx_ = GetBlockIdx();
    needCoreNum_ = expertTokensCountTilingData_->needCoreNum;
    perCoreElements_ = expertTokensCountTilingData_->perCoreElements;
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    actualExpertNum_ = tilingData->actualExpertNum;
    expertNum_ = tilingData->expertNum;
    expertTokensNumFlag_ = tilingData->expertTokensNumFlag;
    dropPadMode_ = tilingData->dropPadMode;
    ep_ = tilingData->ep;
    rowIdxType_ = tilingData->rowIdxType;

    // Static split from tiling: last core may carry a different remainder.
    if (blockIdx_ == needCoreNum_ - 1) {
        curCoreElements_ = expertTokensCountTilingData_->lastCoreElements;
        coreLoopsNum_ = expertTokensCountTilingData_->lastCoreLoops;
        perCorePerLoopElements_ = expertTokensCountTilingData_->lastCorePerLoopElements;
        perCoreLastLoopElements_ = expertTokensCountTilingData_->lastCoreLastLoopElements;
    } else {
        curCoreElements_ = expertTokensCountTilingData_->perCoreElements;
        coreLoopsNum_ = expertTokensCountTilingData_->perCoreLoops;
        perCorePerLoopElements_ = expertTokensCountTilingData_->perCorePerLoopElements;
        perCoreLastLoopElements_ = expertTokensCountTilingData_->perCoreLastLoopElements;
    }

    if (CALC_ACTUAL_EXPERT_NUM) {
        // Dynamic split: the sort stage wrote per-core sorted-element counts to
        // workspace (after the n*k key/value region); re-derive the core split
        // from the actual total instead of the static tiling numbers.
        // key and value
        int64_t kvFactor = 2;
        GlobalTensor<int32_t> sortedNumGm;
        sortedNumGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                    Align(tilingData->n * tilingData->k, sizeof(int32_t)) * kvFactor * kvFactor);
        int32_t totalSortedNum = 0;
        // 16 entries — presumably GATHER_SORT_CORE_NUM partial counts; confirm
        // against the sort stage's writer.
        for (int32_t i = 0; i < 16; i++) {
            totalSortedNum += sortedNumGm.GetValue(i);
        }
        perCoreElements_ = Ceil(totalSortedNum, GetBlockNum());
        needCoreNum_ = Ceil(totalSortedNum, perCoreElements_);
        int64_t lastCoreElements = totalSortedNum - (needCoreNum_ - 1) * perCoreElements_;
        if (blockIdx_ == needCoreNum_ - 1) {
            curCoreElements_ = lastCoreElements;
        } else {
            curCoreElements_ = perCoreElements_;
        }
        coreLoopsNum_ = Ceil(curCoreElements_, expertTokensCountTilingData_->perCorePerLoopElements);
        perCorePerLoopElements_ = Ceil(curCoreElements_, coreLoopsNum_);
        perCoreLastLoopElements_ = curCoreElements_ - (coreLoopsNum_ - 1) * perCorePerLoopElements_;
    }

    // Output element count: key/value mode stores (id, count) pairs, capped at
    // expertNum pairs; other modes store one count per local expert.
    if constexpr (HISTOGRAMTYPE == KEY_VALUE_MODE) {
        expertCountElements_ = ((actualExpertNum_ + 1) < expertNum_) ? (actualExpertNum_ + 1) * KEY_VALUE_MODE_DIM_NUM :
                                                                      expertNum_ * KEY_VALUE_MODE_DIM_NUM;
    } else {
        expertCountElements_ = actualExpertNum_;
    }
    // Workspace layout (int32 units): [sorted ids | kv region | histogram |
    // total count | per-core (id, count) pairs]. Offsets must match the
    // producers of each region — verify against the other stages.
    sortedexpertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + blockIdx_ * perCoreElements_, curCoreElements_);
    expertTokensCountGm_.SetGlobalBuffer((__gm__ int64_t *)expertTokensCount, expertCountElements_);
    expertCountTempGm_.SetGlobalBuffer(
        (__gm__ int32_t *)workspace + Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2, actualExpertNum_);
    expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                            Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2 +
                                            Align(actualExpertNum_, sizeof(int32_t)),
                                        actualExpertNum_);
    expertIdxValueGm_.SetGlobalBuffer(
        (__gm__ int32_t *)workspace + Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2 +
            Align((actualExpertNum_), sizeof(int32_t)) + Align((actualExpertNum_), sizeof(int32_t)),
        coreNum_ * 2);
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreElements_,
                                      curCoreElements_);

    // In gather mode, pre-fill this core's output-index slice with -1
    // ("token not routed"); fence MTE3 writes before subsequent MTE2 reads.
    if ((tilingData->rowIdxType == GATHER) && (blockIdx_ < needCoreNum_)) {
        InitGlobalMemory(expandedRowIdxGm_, curCoreElements_, -1);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    // UB buffers sized for the larger of the two per-loop tile sizes.
    int64_t sortedExpertIdxInLen = Max(perCorePerLoopElements_, perCoreLastLoopElements_);

    pipe_->InitBuffer(sortedExpertIdxInQueue_, 1, AlignBytes(sortedExpertIdxInLen, sizeof(int32_t)));
    pipe_->InitBuffer(expertCountOutToTempQueue_, 1, AlignBytes(actualExpertNum_, sizeof(int32_t)));
    pipe_->InitBuffer(expertCountTempInQueue_, 1, AlignBytes(actualExpertNum_, sizeof(int32_t)));

    pipe_->InitBuffer(expertIdxCountOutQueue_, 1, AlignBytes(expertCountElements_, sizeof(int64_t)));
    pipe_->InitBuffer(expertTotalCountQueue_, 1, AlignBytes(1, sizeof(int32_t)));

    // Core 0 zeroes the shared total counter before the atomic adds start.
    if (blockIdx_ == 0) {
        InitGlobalMemory(expertTotalCountGm_, 1, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    SyncAll();
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::Process()
|
||||
{
|
||||
if (blockIdx_ < needCoreNum_) {
|
||||
for (int64_t i = 0; i < coreLoopsNum_; i++) {
|
||||
int64_t perLoopElements = (i == (coreLoopsNum_ - 1)) ? perCoreLastLoopElements_ : perCorePerLoopElements_;
|
||||
CopyIn(i, perLoopElements);
|
||||
Compute(perLoopElements);
|
||||
CopyOut();
|
||||
}
|
||||
if (ep_ == 1) {
|
||||
CopyOutExpertTotalCount();
|
||||
}
|
||||
}
|
||||
if (ep_ == 1 || expertTokensNumFlag_ || dropPadMode_ == 1) {
|
||||
SyncAll();
|
||||
}
|
||||
/* copy expert tokens count result from worksapce to output GM. */
|
||||
if (blockIdx_ == 0 && expertTokensNumFlag_) {
|
||||
expertCountCopyIn();
|
||||
expertCountCompute();
|
||||
expertCountCopyOut();
|
||||
}
|
||||
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::CopyIn(int64_t loop, int64_t curLoopElements)
|
||||
{
|
||||
LocalTensor<int32_t> sortedExpertIdxInLocal = sortedExpertIdxInQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopElements * sizeof(int32_t)),
|
||||
0, 0, 0};
|
||||
DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
|
||||
int64_t sortedexpertIdxOffset = loop * perCorePerLoopElements_;
|
||||
DataCopyPad(sortedExpertIdxInLocal, sortedexpertIdxGm_[sortedexpertIdxOffset], dataCopyParams, dataCopyPadParams);
|
||||
sortedExpertIdxInQueue_.EnQue(sortedExpertIdxInLocal);
|
||||
}
|
||||
|
||||
// Run-length scan over one sorted tile: for each expert id inside
// [expertStart_, expertEnd_) record its token count (COUNT/KEY_VALUE modes) or
// running total (CUMSUM mode) into expertCountOutLocal, indexed relative to
// expertStart_. Also tracks the last in-range expert and its trailing count
// (finalExpertId / expertTokenValue) for drop/pad handling in CopyOut().
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::Compute(int64_t curLoopElements)
{
    LocalTensor<int32_t> sortedExpertIdxInLocal = sortedExpertIdxInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expertCountOutLocal = expertCountOutToTempQueue_.AllocTensor<int32_t>();
    // Zero the per-loop histogram, then fence vector writes before scalar reads.
    Duplicate(expertCountOutLocal.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
              static_cast<int32_t>(actualExpertNum_));
    SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
    int64_t i = 0;
    int32_t lastExpertId = sortedExpertIdxInLocal.GetValue(0);  // current run's expert id
    int32_t lastIndex = 0;                                      // start index of the current run
    int64_t loopTokenCount = 0;                                 // in-range tokens seen so far this loop
    int32_t lastlastExpertId = lastExpertId;                    // previous run's id (for cumsum gap fill)
    for (i = 1; i < curLoopElements; i++) {
        // Ids are sorted, so once the current run is out of range, we're done.
        if ((lastExpertId >= expertEnd_) || (lastExpertId < expertStart_)) {
            break;
        }
        int32_t curExpertId = sortedExpertIdxInLocal.GetValue(i);
        if (curExpertId != lastExpertId || curExpertId >= expertEnd_) {
            // Run boundary: flush the finished run [lastIndex, i).
            if constexpr (HISTOGRAMTYPE == COUNT_MODE || HISTOGRAMTYPE == KEY_VALUE_MODE) {
                expertCountOutLocal.SetValue(lastExpertId - expertStart_, i - lastIndex);
                loopTokenCount += i - lastIndex;
            } else {
                // CUMSUM: carry the running total across experts with no tokens.
                for (int64_t j = lastlastExpertId; j < lastExpertId; j++) {
                    expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
                }
                loopTokenCount += i - lastIndex;
                expertCountOutLocal.SetValue(lastExpertId - expertStart_, loopTokenCount);
            }
            lastIndex = i;
            lastlastExpertId = lastExpertId;
            lastExpertId = curExpertId;
        }
    }
    // Flush the final run if the tile ended while still in range...
    if ((i == curLoopElements) && ((lastExpertId >= expertStart_) && (lastExpertId < expertEnd_))) {
        if constexpr (HISTOGRAMTYPE == COUNT_MODE || HISTOGRAMTYPE == KEY_VALUE_MODE) {
            expertCountOutLocal.SetValue(lastExpertId - expertStart_, i - lastIndex);
            loopTokenCount += i - lastIndex;
        } else {
            for (int64_t j = lastlastExpertId; j < lastExpertId; j++) {
                expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
            }
            loopTokenCount += i - lastIndex;
            expertCountOutLocal.SetValue(lastExpertId - expertStart_, loopTokenCount);
            // Propagate the final total to all remaining (empty) experts.
            for (int64_t j = lastExpertId; j < expertEnd_; j++) {
                expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
            }
        }
    } else {
        // ...otherwise (early break) cumsum still needs the tail filled.
        if constexpr (HISTOGRAMTYPE == EXERPT_TOKENS_CUMSUM) {
            for (int64_t j = lastlastExpertId; j < expertEnd_; j++) {
                expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
            }
        }
    }
    actualExpertTotalNum_ += loopTokenCount;
    finalExpertId = lastExpertId;
    expertTokenValue = (i - lastIndex);

    expertCountOutToTempQueue_.EnQue<int32_t>(expertCountOutLocal);
    sortedExpertIdxInQueue_.FreeTensor(sortedExpertIdxInLocal);
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::CopyOutExpertTotalCount()
|
||||
{
|
||||
LocalTensor<int32_t> expertTotalCountLocal = expertTotalCountQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams copyTotalCountParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
expertTotalCountLocal.SetValue(0, static_cast<int32_t>(actualExpertTotalNum_));
|
||||
SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
|
||||
SetAtomicAdd<int32_t>();
|
||||
DataCopyPad(expertTotalCountGm_, expertTotalCountLocal, copyTotalCountParams);
|
||||
SetAtomicNone();
|
||||
expertTotalCountQueue_.FreeTensor(expertTotalCountLocal);
|
||||
}
|
||||
|
||||
// Accumulates this core's per-expert counts into the temporary GM workspace via
// atomic add. In DROP_PAD mode it additionally records this block's last
// (expertId, tokenCount) pair at expertIdxValueGm_[blockIdx_ * EXPERT_ID_VALUE_NUM].
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::CopyOut()
{
    LocalTensor<int32_t> expertCountOutLocal = expertCountOutToTempQueue_.DeQue<int32_t>();
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>((actualExpertNum_) * sizeof(int32_t)),
                                 0, 0, 0};
    // Scalar writes must complete before MTE3 reads the tensor.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    // Counts from all cores accumulate into the shared workspace.
    SetAtomicAdd<int32_t>();
    DataCopyPad(expertCountTempGm_, expertCountOutLocal, copyParams);
    SetAtomicNone();

    if (dropPadMode_ == DROP_PAD) {
        // Reuse the first two slots of the local tensor for the (expertId, count) pair.
        expertCountOutLocal.SetValue(0, finalExpertId);
        expertCountOutLocal.SetValue(1, expertTokenValue);
        DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(EXPERT_ID_VALUE_NUM * sizeof(int32_t)), 0, 0, 0};
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        DataCopyPad(expertIdxValueGm_[blockIdx_ * EXPERT_ID_VALUE_NUM], expertCountOutLocal, copyParams);
    }
    expertCountOutToTempQueue_.FreeTensor(expertCountOutLocal);
}
|
||||
|
||||
// Loads the int32 per-expert counts previously accumulated in the GM workspace
// (expertCountTempGm_) back into UB for final-layout conversion.
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::expertCountCopyIn()
{
    LocalTensor<int32_t> expertCountTempInLocal = expertCountTempInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>((actualExpertNum_) * sizeof(int32_t)), 0, 0, 0};
    // No padding is inserted; the copy length is exact.
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(expertCountTempInLocal, expertCountTempGm_, dataCopyParams, dataCopyPadParams);
    expertCountTempInQueue_.EnQue(expertCountTempInLocal);
}
|
||||
|
||||
// Converts the accumulated int32 per-expert counts into the final int64 output
// layout: KEY_VALUE_MODE packs dense (expertId, count) pairs for non-empty
// experts only; other modes simply widen int32 -> int64 elementwise.
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::expertCountCompute()
{
    LocalTensor<int32_t> expertCountTempInLocal = expertCountTempInQueue_.DeQue<int32_t>();
    LocalTensor<int64_t> expertCountOutLocal = expertIdxCountOutQueue_.AllocTensor<int64_t>();
    if constexpr (HISTOGRAMTYPE == KEY_VALUE_MODE) {
        int64_t expertOffset = 0;
        // Zero the whole (key, value) output region before scattering pairs into it.
        Duplicate(expertCountOutLocal.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
                  static_cast<int32_t>(expertCountElements_ * KEY_VALUE_MODE));
        // Vector duplicate must finish before the scalar SetValue writes below.
        SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
        for (int64_t i = 0; i < actualExpertNum_; i++) {
            int64_t expertCount = static_cast<int64_t>(expertCountTempInLocal.GetValue(i));
            if (expertCount != 0) {
                // Key = global expert id, value = token count.
                expertCountOutLocal.SetValue(expertOffset * KEY_VALUE_MODE_DIM_NUM, i + expertStart_);
                expertCountOutLocal.SetValue(expertOffset * KEY_VALUE_MODE_DIM_NUM + 1, expertCount);
                expertOffset++;
            }
        }
    } else {
        // Count/cumsum layout: widen each int32 count to int64.
        Cast(expertCountOutLocal, expertCountTempInLocal, RoundMode::CAST_NONE, actualExpertNum_);
    }

    expertIdxCountOutQueue_.EnQue<int64_t>(expertCountOutLocal);
    expertCountTempInQueue_.FreeTensor(expertCountTempInLocal);
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::expertCountCopyOut()
|
||||
{
|
||||
LocalTensor<int64_t> expertCountOutLocal = expertIdxCountOutQueue_.DeQue<int64_t>();
|
||||
DataCopyExtParams copyParams{static_cast<uint16_t>(1),
|
||||
static_cast<uint32_t>(expertCountElements_ * sizeof(int64_t)), 0, 0, 0};
|
||||
DataCopyPad(expertTokensCountGm_, expertCountOutLocal, copyParams);
|
||||
copyParams.blockLen = sizeof(int32_t);
|
||||
expertIdxCountOutQueue_.FreeTensor(expertCountOutLocal);
|
||||
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_EXPERT_TOKENS_COUNT_H
|
||||
280
csrc/moe_init_routing_custom/op_kernel/moe_custom_full_load.h
Normal file
280
csrc/moe_init_routing_custom/op_kernel/moe_custom_full_load.h
Normal file
@@ -0,0 +1,280 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_full_load.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_H
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// "Full load" kernel path for MoeInitRoutingCustom: the entire expertIdx tensor
// is loaded into UB at once, sorted, run-length counted per expert, and one
// token row is dynamically quantized (bf16 -> int8 with a per-row fp32 scale).
class MoeCustomFullLoad {
public:
    __aicore__ inline MoeCustomFullLoad(){};
    // Binds global-memory buffers and tiling fields, then carves UB into the
    // queues/buffers used by the pipeline.
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedX,
                                GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    // Runs CopyIn -> SortCompute -> ExpertCountCompute -> CopyOutDynamicQuant
    // on cores with blockIdx < GetBlockNum().
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn();               // load expertIdx, build identity row indices
    __aicore__ inline void SortCompute();          // sort expert ids (negated-fp32 trick)
    __aicore__ inline void ExpertCountCompute();   // run-length (expertId, count) pairs
    __aicore__ inline void CopyOutDynamicQuant();  // quantize one row, copy results out

private:
    int64_t sortNum;  // tileLength rounded up to a multiple of ONE_REPEAT_SORT_NUM

    TPipe *pipe;
    TQue<QuePosition::VECIN, 1> sortDataCopyInQueue;    // expertIdx + generated row indices
    TQue<QuePosition::VECOUT, 1> sortDataCopyOutQueue;  // sorted ids + dst->src row payload
    TQue<QuePosition::VECOUT, 1> expertTokensCountOrCumsumOutQueue;  // (id, count) pairs
    TQue<QuePosition::VECIN, 1> smoothInQueue;   // per-expert smooth-scale row
    TQue<QuePosition::VECIN, 1> inputXInQueue;   // bf16 input row
    TQue<QuePosition::VECOUT, 1> inputXOutQueue; // int8 quantized row
    TQue<QuePosition::VECOUT, 1> scaleOutQueue;  // per-row dynamic-quant scale
    TQue<QuePosition::VECOUT, 1> rowIdxOutQueue; // scattered expanded row indices

    TBuf<TPosition::VECCALC> tempBuffer;       // scratch for Concat/Sort
    TBuf<TPosition::VECCALC> sortedBuffer;     // sorted (key, payload) output
    TBuf<TPosition::VECCALC> quantTempBuffer;  // fp32 scratch for quantization

    GlobalTensor<bfloat16_t> inputXGm;
    GlobalTensor<float> smoothGm;
    GlobalTensor<int8_t> expandedXGm;
    GlobalTensor<float> expandedScaleGm;
    GlobalTensor<int32_t> expertIdxGm;
    GlobalTensor<int32_t> expendedRowIdxGm;
    GlobalTensor<int32_t> sortedExpertForSourceRowGm;
    GlobalTensor<int32_t> expandDstToSrcRowGm;
    GlobalTensor<int32_t> sortedexpertIdxGm;
    GlobalTensor<int32_t> expertCountTempGm;
    GlobalTensor<int32_t> expandedRowIdxGm;
    GlobalTensor<int64_t> expertTokensCountOrCumsumGm;

    int64_t blockIdx = 0;
    int64_t tileLength;
    int64_t bufferNum = 1;
    int64_t totalLength;  // n * k routing entries
    int64_t n;
    int64_t k;
    int64_t cols_;
    // NOTE(review): hard-coded expert table size; confirm it matches tiling data.
    int64_t expertNum_ = 256;
    int64_t rowIdxType_;
    int64_t kvFactor = 2;  // width of a (key, value) pair
    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
};
|
||||
|
||||
// Loads the whole expertIdx tensor into the first half of the sort input buffer
// and generates identity row indices 0..sortNum-1 in the second half; row index
// i is the sort payload paired with expertIdx[i].
__aicore__ inline void MoeCustomFullLoad::CopyIn()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(this->totalLength * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm, dataCopyParams, dataCopyPadParams);
    LocalTensor<int32_t> rowIdxLocal = inLocal[this->sortNum];
    // Arithmetic progression 0, 1, 2, ... over the padded sort length.
    ArithProgression<int32_t>(rowIdxLocal, 0, 1, this->sortNum);
    sortDataCopyInQueue.EnQue(inLocal);
}
|
||||
|
||||
// Sorts routing entries by expert id. Ids are cast to fp32 and negated so the
// hardware's descending Sort yields ascending expert order; tail slots beyond
// totalLength are filled with MIN_FP32 sentinels so they sort to the end.
// Output: sorted expert ids (int32) in outLocal[0..], and the dst->src row
// permutation payload in outLocal[sortNum..].
__aicore__ inline void MoeCustomFullLoad::SortCompute()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdx = inLocal[0];
    // In-place reinterpret: the int32 ids are overwritten by their fp32 images.
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, this->tileLength);
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, this->tileLength);
    int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        // Mask-select the partial final sort repeat and fill it with MIN_FP32.
        int duplicateIndex = this->totalLength - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    LocalTensor<float> concatLocal;
    LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    Concat(concatLocal, expertIdxFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    LocalTensor<uint32_t> sourceRowLocal;
    sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
    // Full sort carrying the source row index as the payload.
    Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
    LocalTensor<uint32_t> expandDstToSrcRowLocal;
    expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
    // Split the sorted (key, payload) stream back into two contiguous tensors.
    Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
    // Undo the negation to recover the original expert ids.
    Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);

    LocalTensor<int32_t> expertForSourceRowLocalInt32;
    expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
    Cast(expertForSourceRowLocalInt32, sortedExpertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
    sortDataCopyOutQueue.EnQue<float>(outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
}
|
||||
|
||||
// Run-length encodes the sorted expert ids into (expertId, tokenCount) pairs,
// then appends a terminating (0, 0) pair. The sorted tensor is re-enqueued so
// the quantization stage can consume it.
__aicore__ inline void MoeCustomFullLoad::ExpertCountCompute()
{
    LocalTensor<int32_t> outLocal = sortDataCopyOutQueue.DeQue<int32_t>();
    LocalTensor<int32_t> sortedExpertId = outLocal;
    LocalTensor<int64_t> expertTokensLocalTensor = expertTokensCountOrCumsumOutQueue.AllocTensor<int64_t>();

    int64_t i = 0;
    int32_t lastExpertId = sortedExpertId.GetValue(0);
    int32_t lastIndex = 0;
    int64_t index = 0;
    // A change of id closes the previous run and records its length.
    for (i = 1; i < this->totalLength; i++) {
        int32_t curExpertId = sortedExpertId.GetValue(i);
        if (curExpertId != lastExpertId) {
            expertTokensLocalTensor.SetValue(index * kvFactor, lastExpertId);
            expertTokensLocalTensor.SetValue(index * kvFactor + 1, i - lastIndex);
            index++;
            lastIndex = i;
            lastExpertId = curExpertId;
        }
    }
    // Close the final run. NOTE(review): the loop always exits with
    // i == totalLength, so this condition is effectively always true.
    if (i == this->totalLength) {
        expertTokensLocalTensor.SetValue(index * kvFactor, lastExpertId);
        expertTokensLocalTensor.SetValue(index * kvFactor + 1, i - lastIndex);
        index++;
    }
    // totalLength < 256
    // Terminating (0, 0) pair marks the end of the valid entries.
    expertTokensLocalTensor.SetValue(index * kvFactor, 0);
    expertTokensLocalTensor.SetValue(index * kvFactor + 1, 0);
    // Scalar writes must complete before any MTE3 copy of this tensor.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);

    expertTokensCountOrCumsumOutQueue.EnQue<int64_t>(expertTokensLocalTensor);
    // Hand the sorted data on to CopyOutDynamicQuant.
    sortDataCopyOutQueue.EnQue<int32_t>(outLocal);
}
|
||||
|
||||
// Copies the expert-token table to GM, then dynamically quantizes one token row:
//   y = x * smooth[expert]; scale = max(|y|) / 127; out = round(y / scale) as int8
// (via an fp32 -> fp16 -> int8 cast chain). Core 0 additionally writes the
// expanded row-index mapping for all tokens.
__aicore__ inline void MoeCustomFullLoad::CopyOutDynamicQuant()
{
    LocalTensor<int64_t> expertTokensLocalTensor = expertTokensCountOrCumsumOutQueue.DeQue<int64_t>();
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = expertNum_ * sizeof(int64_t);
    DataCopyPad(expertTokensCountOrCumsumGm, expertTokensLocalTensor, intriParams);
    expertTokensCountOrCumsumOutQueue.FreeTensor(expertTokensLocalTensor);
    LocalTensor<int32_t> outLocal = sortDataCopyOutQueue.DeQue<int32_t>();

    // Each core quantizes the token at sorted position == its block index.
    int64_t expertIdx = outLocal.GetValue(blockIdx);
    LocalTensor<bfloat16_t> xInLocal = inputXInQueue.AllocTensor<bfloat16_t>();
    LocalTensor<int8_t> xOutLocal = inputXOutQueue.AllocTensor<int8_t>();
    LocalTensor<float> smoothLocal = smoothInQueue.AllocTensor<float>();
    LocalTensor<float> scaleLocal = scaleOutQueue.AllocTensor<float>();
    LocalTensor<float> tempLocal = quantTempBuffer.Get<float>();
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(cols_ * sizeof(bfloat16_t)), 0, 0, 0};
    DataCopyExtParams smoothParams{1, static_cast<uint32_t>(cols_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(cols_ * sizeof(int8_t)), 0, 0, 0};
    // NOTE(review): x is read from offset 0 of inputXGm regardless of blockIdx —
    // confirm the intended source row.
    DataCopyPad(xInLocal, inputXGm, copyInParams, {false, 0, 0, 0});
    // Smooth-scale row is selected by the token's expert id.
    DataCopyPad(smoothLocal, smoothGm[expertIdx * cols_], smoothParams, {false, 0, 0, 0});
    smoothInQueue.EnQue<float>(smoothLocal);
    smoothLocal = smoothInQueue.DeQue<float>();
    Cast(tempLocal, xInLocal, RoundMode::CAST_NONE, cols_);
    // smoothLocal now holds y = x * smooth (overwritten in place).
    Mul(smoothLocal, tempLocal, smoothLocal, cols_);
    // compute scale
    Abs(tempLocal, smoothLocal, cols_);
    ReduceMax(scaleLocal, tempLocal, tempLocal, cols_);
    float scaleValue = scaleLocal.GetValue(0) / 127.0f;
    Duplicate<float>(scaleLocal, scaleValue, DST_REP_STRIDE);
    Duplicate<float>(tempLocal, scaleValue, cols_);
    // compute quant
    Div(tempLocal, smoothLocal, tempLocal, cols_);
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_ODD, cols_); // fp32->fp16
    Cast(xOutLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_RINT, cols_); // fp16->int8
    inputXOutQueue.EnQue<int8_t>(xOutLocal);
    xOutLocal = inputXOutQueue.DeQue<int8_t>();
    scaleOutQueue.EnQue<float>(scaleLocal);
    scaleLocal = scaleOutQueue.DeQue<float>();
    DataCopyPad(expandedXGm[blockIdx * cols_], xOutLocal, copyOutParams);
    DataCopyPad(expandedScaleGm[blockIdx], scaleLocal, {1, 4, 0, 0, 0});
    smoothInQueue.FreeTensor(smoothLocal);
    inputXInQueue.FreeTensor(xInLocal);
    inputXOutQueue.FreeTensor(xOutLocal);
    scaleOutQueue.FreeTensor(scaleLocal);

    // Only core 0 emits the full expanded row-index tensor.
    if (blockIdx == 0) {
        intriParams.blockLen = this->totalLength * sizeof(int32_t);
        if (rowIdxType_ == 1) {
            // The sort payload is already the desired index layout; copy directly.
            DataCopyPad(expandedRowIdxGm, outLocal[this->sortNum], intriParams);
        } else if (rowIdxType_ == 0) {
            // Invert the permutation: payload maps dst -> src, output needs src -> dst.
            LocalTensor rowIdxLocalTensor = rowIdxOutQueue.AllocTensor<int32_t>();
            for (int i = 0; i < this->totalLength; i++) {
                int32_t dstIdx = outLocal[this->sortNum].GetValue(i);
                rowIdxLocalTensor.SetValue(dstIdx, i);
            }
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm, rowIdxLocalTensor, intriParams);
            rowIdxOutQueue.FreeTensor(rowIdxLocalTensor);
        }
    }
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// Wires up global-memory tensors from kernel arguments/tiling data and
// partitions UB into the queues and scratch buffers the pipeline uses.
__aicore__ inline void MoeCustomFullLoad::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset,
                                               GM_ADDR expandedX, GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                               GM_ADDR expandedScale, const MoeInitRoutingCustomTilingData *tilingData,
                                               TPipe *tPipe)
{
    this->pipe = tPipe;
    this->blockIdx = GetBlockIdx();
    this->n = tilingData->n;
    this->k = tilingData->k;
    this->tileLength = Align(tilingData->vbsComputeParamsOp.lastCorePerLoopElements, sizeof(int32_t));
    // Round up to whole 32-element sort repeats.
    this->sortNum = Ceil(this->tileLength, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    this->totalLength = tilingData->n * tilingData->k;
    cols_ = tilingData->cols;
    rowIdxType_ = tilingData->rowIdxType;

    expertIdxGm.SetGlobalBuffer((__gm__ int32_t *)expertIdx, this->tileLength);

    expandedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, this->tileLength);
    expertTokensCountOrCumsumGm.SetGlobalBuffer((__gm__ int64_t *)expertTokensCountOrCumsum, this->tileLength);

    inputXGm.SetGlobalBuffer((__gm__ bfloat16_t *)x, this->n * cols_);
    smoothGm.SetGlobalBuffer((__gm__ float *)scale, expertNum_ * cols_);
    expandedXGm.SetGlobalBuffer((__gm__ int8_t *)expandedX, this->n * cols_ * this->k);
    expandedScaleGm.SetGlobalBuffer((__gm__ float *)expandedScale, this->n * this->k);

    // key and value
    int64_t buffSize = this->sortNum * sizeof(int32_t) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, buffSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, buffSize);
    pipe->InitBuffer(tempBuffer, buffSize);
    pipe->InitBuffer(sortedBuffer, buffSize);
    // NOTE(review): sibling buffers below use AlignBytes(...); Align(...) here may
    // produce an element count rather than a byte size — confirm intended.
    pipe->InitBuffer(expertTokensCountOrCumsumOutQueue, bufferNum, Align(expertNum_ * kvFactor, sizeof(int32_t)));

    pipe->InitBuffer(smoothInQueue, bufferNum, AlignBytes(cols_, sizeof(float)));
    pipe->InitBuffer(inputXInQueue, bufferNum, AlignBytes(cols_, sizeof(bfloat16_t)));
    pipe->InitBuffer(inputXOutQueue, bufferNum, AlignBytes(cols_, sizeof(int8_t)));
    pipe->InitBuffer(quantTempBuffer, AlignBytes(cols_, sizeof(float)));
    pipe->InitBuffer(scaleOutQueue, bufferNum, AlignBytes(1, sizeof(float)));
    pipe->InitBuffer(rowIdxOutQueue, bufferNum, AlignBytes(this->totalLength, sizeof(int32_t)));
}
|
||||
|
||||
// Drives the full-load pipeline: copy-in, sort, per-expert counting, and
// dynamic-quant copy-out, in that order.
__aicore__ inline void MoeCustomFullLoad::Process()
{
    // Cores beyond the scheduled block count have no work assigned.
    if (this->blockIdx >= GetBlockNum()) {
        return;
    }
    CopyIn();
    SortCompute();
    ExpertCountCompute();
    CopyOutDynamicQuant();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_H
|
||||
@@ -0,0 +1,512 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_base_full_load.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_BASE_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_BASE_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Shared base for the full-load variants of MoeInitRoutingCustom: holds tiling
// state, UB queues/buffers, and the sort/count primitives. Derived classes (not
// shown here) drive the pipeline.
template <typename T>
class MoeCustomFullLoadBase {
public:
    __aicore__ inline MoeCustomFullLoadBase(){};
    // Binds GM tensors, caches tiling fields, and initializes UB queues/buffers.
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);

protected:
    __aicore__ inline void CopyIn();
    __aicore__ inline void Compute();            // dispatches to range or plain sort
    __aicore__ inline void TilingInKernel();
    __aicore__ inline void SortComputeWithRange();  // sorts only ids in [expertStart_, expertEnd_)
    __aicore__ inline void SortCompute();
    __aicore__ inline void CopyOutIdx();
    __aicore__ inline void CopyOutDefaultGatherIdx();
    __aicore__ inline void CopyOutDefaultTokenCountOrCumsum();
    __aicore__ inline void ComputeExpertTokenCountOrCumsum();

protected:
    int64_t sortNum_;  // valid element count rounded up to whole sort repeats
    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;
    int64_t blockIdx_;
    int64_t needCoreNum_;
    int64_t coreIndicesElements_;     // index count handled by this core
    int64_t perCoreIndicesElements_;  // index count per non-last core
    int64_t k_;
    int64_t n_;
    int64_t cols_;
    int64_t dropPadMode_;
    int64_t activeNum_;
    int64_t expertNum_;
    int64_t expertStart_ = 0;  // inclusive start of this rank's expert range
    int64_t expertEnd_ = 0;    // exclusive end of this rank's expert range
    int64_t bufferNum_ = 1;
    int64_t kvFactor_ = 2;  // width of a (key, value) pair
    int64_t totalLength_;   // n * k routing entries
    int64_t tileLength_;
    int64_t expertTokensNumType_ = 0;
    int64_t expertTokensNumFlag_ = 0;
    uint64_t actual_idx_num_ = 0;  // entries surviving the expert-range filter
    int64_t ep_ = 0;               // expert-parallel mode flag
    int64_t gatherFirstFullload_ = 0;
    int64_t isInputScale_ = 0;
    int64_t rowIdxType_ = 0;
    int64_t actualExpertNum_ = 0;
    int64_t expertCountElements_ = 0;
    int64_t curIndexStart_;
    int64_t startXRow_;
    int64_t endXRow_;
    int64_t quantMode_ = -1;

    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
    static constexpr int64_t MASK_STRIDE = 64;

    TQue<QuePosition::VECOUT, 1> expandedRowIdxCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> expandedExpertIdxCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> expandDstToSrcRowQueue_;
    TQue<QuePosition::VECOUT, 1> expertTokensCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> sortDataCopyInQueue_;

    TBuf<TPosition::VECCALC> tempBuffer_;    // Concat/Sort scratch and compare masks
    TBuf<TPosition::VECCALC> sortedBuffer_;  // sorted output / MIN_FP32 fill

    GlobalTensor<int32_t> expertIdxGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<int64_t> expertTokensCountOrCumsumGm_;

    TPipe *pipe_;
};
|
||||
|
||||
// Caches tiling fields, computes this core's index-range split, binds the GM
// tensors, and partitions UB into the queues/buffers used by the sort pipeline.
template <typename T>
__aicore__ inline void MoeCustomFullLoadBase<T>::Init(GM_ADDR expertIdx, GM_ADDR expandedRowIdx,
                                                      GM_ADDR expertTokensCountOrCumsum, GM_ADDR workspace,
                                                      const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);
    this->blockIdx_ = GetBlockIdx();
    this->n_ = tilingData->n;
    this->k_ = tilingData->k;
    this->cols_ = tilingData->cols;
    this->expertStart_ = tilingData->expertStart;
    this->expertEnd_ = tilingData->expertEnd;
    this->needCoreNum_ = this->gatherOutTilingData_->needCoreNum;

    this->perCoreIndicesElements_ = this->gatherOutTilingData_->perCoreIndicesElements;
    this->dropPadMode_ = tilingData->dropPadMode;
    this->activeNum_ = tilingData->activeNum;
    this->quantMode_ = tilingData->quantMode;
    // The last active core takes the remainder of the index range.
    if (this->blockIdx_ == this->gatherOutTilingData_->needCoreNum - 1) {
        this->coreIndicesElements_ = this->gatherOutTilingData_->lastCoreIndicesElements;
    } else {
        this->coreIndicesElements_ = this->gatherOutTilingData_->perCoreIndicesElements;
    }
    this->expertTokensNumType_ = tilingData->expertTokensNumType;
    this->expertTokensNumFlag_ = tilingData->expertTokensNumFlag;
    this->expertNum_ = tilingData->expertNum;
    this->totalLength_ = tilingData->n * tilingData->k;
    this->ep_ = tilingData->ep;
    this->gatherFirstFullload_ = tilingData->gatherFirstFullload;
    this->isInputScale_ = tilingData->isInputScale;
    this->tileLength_ = Align(tilingData->vbsComputeParamsOp.lastCorePerLoopElements, sizeof(int32_t));
    // Round up to whole 32-element sort repeats.
    this->sortNum_ = Ceil(this->tileLength_, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    this->actual_idx_num_ = this->totalLength_;
    this->rowIdxType_ = tilingData->rowIdxType;
    this->actualExpertNum_ = tilingData->actualExpertNum;
    this->pipe_ = tPipe;

    expertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expertIdx, this->tileLength_);
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, this->tileLength_);
    if (this->expertTokensNumFlag_ > 0) {
        expertTokensCountOrCumsumGm_.SetGlobalBuffer((__gm__ int64_t *)expertTokensCountOrCumsum);
    }

    // KEY_VALUE output stores (id, count) pairs; other modes store one slot per expert.
    if (expertTokensNumType_ == EXERPT_TOKENS_KEY_VALUE) {
        expertCountElements_ = expertNum_ * EXERPT_TOKENS_KEY_VALUE;
    } else {
        expertCountElements_ = actualExpertNum_;
    }
    int64_t buffSize = this->sortNum_ * sizeof(int32_t);

    // This core's slice of the flattened (n * k) index space, and the x rows it spans.
    curIndexStart_ = this->blockIdx_ * this->perCoreIndicesElements_;
    startXRow_ = curIndexStart_ / this->k_;
    endXRow_ = (curIndexStart_ + this->coreIndicesElements_ - 1) / this->k_;

    pipe_->InitBuffer(expandedExpertIdxCopyOutQueue_, bufferNum_, buffSize);
    pipe_->InitBuffer(expertTokensCopyOutQueue_, bufferNum_, AlignBytes(expertCountElements_, sizeof(int64_t)));
    pipe_->InitBuffer(expandDstToSrcRowQueue_, bufferNum_, buffSize);
    pipe_->InitBuffer(expandedRowIdxCopyOutQueue_, bufferNum_, buffSize)
;
    pipe_->InitBuffer(sortDataCopyInQueue_, bufferNum_, buffSize * kvFactor_);
    pipe_->InitBuffer(tempBuffer_, buffSize * kvFactor_);
    pipe_->InitBuffer(sortedBuffer_, buffSize * kvFactor_);
}
|
||||
|
||||
// Loads the full expertIdx tensor into the first half of the sort input buffer
// and generates identity row indices 0..totalLength-1 in the second half.
template <typename T>
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyIn()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0,
                                     0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm_, dataCopyParams, dataCopyPadParams);
    // Row index i is the sort payload paired with expertIdx[i].
    ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, totalLength_);
    sortDataCopyInQueue_.EnQue(inLocal);
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeCustomFullLoadBase<T>::Compute()
|
||||
{
|
||||
if (ep_) {
|
||||
SortComputeWithRange();
|
||||
} else {
|
||||
SortCompute();
|
||||
}
|
||||
}
|
||||
|
||||
// Sorts routing entries restricted to this rank's expert range. Ids are negated
// in fp32 so the descending hardware Sort yields ascending expert order.
// gatherFirstFullload_ path: compact in-range entries first with compare masks +
// GatherMask, then sort only the survivors. Otherwise: out-of-range ids are
// replaced by MIN_FP32 sentinels so they sort to the end.
template <typename T>
__aicore__ inline void MoeCustomFullLoadBase<T>::SortComputeWithRange()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdxLocal = inLocal[0];
    // In-place reinterpret: the int32 ids are overwritten by their fp32 images.
    LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
    LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
    Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, totalLength_);
    PipeBarrier<PIPE_V>();
    Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    if (gatherFirstFullload_) {
        int64_t maskOffset = AlignBytes(Ceil(totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE, sizeof(int8_t));
        LocalTensor<uint8_t> compareScalarMaskLocalTensor0 = tempBuffer_.Get<uint8_t>()[maskOffset];
        LocalTensor<uint8_t> compareScalarMaskLocalTensor1 = tempBuffer_.Get<uint8_t>()[maskOffset * kvFactor_];
        LocalTensor<uint8_t> gatherMaskLocalTensor = tempBuffer_.Get<uint8_t>();

        // Find elements >= expertStart_, which means -elements <= -expertStart_
        AscendC::CompareScalar(
            compareScalarMaskLocalTensor0, expertIdxLocalFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::LE,
            (totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        PipeBarrier<PIPE_V>();

        // Find elements < expertEnd_, which means -elements > -expertEnd_
        AscendC::CompareScalar(
            compareScalarMaskLocalTensor1, expertIdxLocalFp32, static_cast<float>(-expertEnd_), AscendC::CMPMODE::GT,
            (totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        PipeBarrier<PIPE_V>();

        // Combine both conditions into one gather mask: expertStart_ <= id < expertEnd_.
        And(gatherMaskLocalTensor.ReinterpretCast<uint16_t>(),
            compareScalarMaskLocalTensor0.ReinterpretCast<uint16_t>(),
            compareScalarMaskLocalTensor1.ReinterpretCast<uint16_t>(),
            Ceil(totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE / kvFactor_);
        PipeBarrier<PIPE_V>();

        uint64_t rsvdCnt = 0;
        GatherMaskParams gatherMaskParams;
        gatherMaskParams.repeatTimes = 1;
        gatherMaskParams.src0BlockStride = 1;
        gatherMaskParams.src0RepeatStride = DST_REP_STRIDE;
        gatherMaskParams.src1RepeatStride = DST_REP_STRIDE;
        // Compact the in-range ids to the front; rsvdCnt returns the survivor count.
        GatherMask(expertIdxLocalFp32, expertIdxLocalFp32, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
                   static_cast<uint32_t>(totalLength_), gatherMaskParams, rsvdCnt);
        PipeBarrier<PIPE_V>();
        actual_idx_num_ = rsvdCnt;
        sortNum_ = Ceil(actual_idx_num_, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;

        // Compact the matching row indices with the same mask.
        GatherMask(rowIdxLocal, rowIdxLocal, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
                   static_cast<uint32_t>(totalLength_), gatherMaskParams, actual_idx_num_);
        PipeBarrier<PIPE_V>();
        TilingInKernel();
    } else {
        LocalTensor<uint8_t> maskLocalTensor = tempBuffer_.Get<uint8_t>();
        AscendC::CompareScalar(
            maskLocalTensor, expertIdxLocalFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::GT,
            (totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        LocalTensor<float> floatMinLocalTensor = sortedBuffer_.Get<float>();
        Duplicate(floatMinLocalTensor, MIN_FP32, totalLength_);
        PipeBarrier<PIPE_V>();
        // Out-of-range entries are overwritten with MIN_FP32 so they sort last.
        Select(expertIdxLocalFp32, maskLocalTensor, floatMinLocalTensor, expertIdxLocalFp32,
               SELMODE::VSEL_TENSOR_TENSOR_MODE, totalLength_);
        PipeBarrier<PIPE_V>();
    }
    // handle actual_idx_num_ == 0
    if (actual_idx_num_ < 1) {
        sortDataCopyInQueue_.FreeTensor(inLocal);
        return;
    }
    int64_t duplicateNum = actual_idx_num_ % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        // Fill the partial final sort repeat with MIN_FP32 sentinels.
        int duplicateIndex = actual_idx_num_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> (FP32_ONE_REPEAT_NUM - ONE_REPEAT_SORT_NUM));
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
        PipeBarrier<PIPE_V>();
    }

    LocalTensor<float> concatLocal = expertIdxLocalFp32;
    LocalTensor<float> tempTensor = tempBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    LocalTensor<float> sortedLocal = sortedBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    // Full sort carrying the source row index as the payload.
    Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
    LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
    Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    // Undo the negation to recover the original expert ids.
    Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, actual_idx_num_);
    PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> expandedExpertIdxLocalInt32;
    expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
    Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, actual_idx_num_);
    PipeBarrier<PIPE_V>();
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
    expandDstToSrcRowQueue_.EnQue<uint32_t>(expandDstToSrcRowLocal);
    sortDataCopyInQueue_.FreeTensor(inLocal);
}
|
||||
|
||||
template <typename T>
// Sorts all token->expert assignments by expert id and derives both index mappings:
//   - expandDstToSrcRowLocal : sorted position -> original (expanded) row index
//   - expandedRowIdx         : original row index -> sorted position
// The hardware Sort32 sorts descending, so keys are negated (Muls by -1) before the
// sort to obtain ascending order, and negated back afterwards.
__aicore__ inline void MoeCustomFullLoadBase<T>::SortCompute()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdxLocal = inLocal[0];
    // In-place reinterpret: the int32 expert ids are overwritten by their fp32 form.
    LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
    Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, totalLength_);
    PipeBarrier<PIPE_V>();
    // Negate so the descending hardware sort yields ascending expert ids.
    Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    // Pad the tail of the last partial 32-element sort group with MIN_FP32 so the
    // padding entries sink to the end of the (descending) sort result.
    int64_t duplicateNum = totalLength_ % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = totalLength_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> (FP32_ONE_REPEAT_NUM - ONE_REPEAT_SORT_NUM));
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
        PipeBarrier<PIPE_V>();
    }
    // First sort pass: key = negated expert id, payload = original row index.
    LocalTensor<float> concatLocal = expertIdxLocalFp32;
    LocalTensor<float> tempTensor = tempBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
    LocalTensor<float> sortedLocal = sortedBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
    LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
    LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
    // Split the sorted (key, payload) pairs: keys -> sorted expert ids (still negated),
    // payloads -> source row per sorted position.
    Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
    // Undo the negation and convert the sorted expert ids back to int32.
    Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> expandedExpertIdxLocalInt32;
    expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
    Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, totalLength_);
    PipeBarrier<PIPE_V>();

    // Second sort pass: invert the permutation. Key = negated dst->src row index,
    // payload = the sorted position (0..totalLength_-1 via ArithProgression), which
    // after sorting yields src-row -> sorted-position (expandedRowIdx).
    Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
         totalLength_);
    PipeBarrier<PIPE_V>();
    Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, totalLength_);
    PipeBarrier<PIPE_V>();
    if (duplicateNum > 0) {
        // Same tail padding as above, now for the row-index keys.
        int duplicateIndex = totalLength_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> (FP32_ONE_REPEAT_NUM - ONE_REPEAT_SORT_NUM));
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
        PipeBarrier<PIPE_V>();
    }
    Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    Extract(tempTensor, expandedRowIdx, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    // SCATTER output (or quantMode 1) also needs the dst->src mapping as plain int32,
    // so restore it from the negated fp32 working copy.
    if (rowIdxType_ == SCATTER or quantMode_ == 1) {
        Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, totalLength_);
        PipeBarrier<PIPE_V>();
        Cast(expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), expandDstToSrcRowLocalFp32, RoundMode::CAST_RINT,
             totalLength_);
    }
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
    expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
    expandDstToSrcRowQueue_.EnQue<uint32_t>(expandDstToSrcRowLocal);
    sortDataCopyInQueue_.FreeTensor(inLocal);
}
|
||||
|
||||
template <typename T>
// Fallback output path used when no expert on this rank receives any token:
// fills expandedRowIdx with -1 ("row not routed") and writes it to global memory.
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyOutDefaultGatherIdx()
{
    LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<int32_t>();
    Duplicate(expandedRowIdx, static_cast<int32_t>(-1), static_cast<int32_t>(totalLength_));
    // The vector Duplicate must complete in UB before MTE3 copies the buffer out.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0, 0,
                                 0};
    DataCopyPad(expandedRowIdxGm_, expandedRowIdx, copyParams);
    expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
}
|
||||
|
||||
template <typename T>
// Fallback output path used when no expert on this rank receives any token:
// writes an all-zero expert token count/cumsum buffer to global memory.
// The Duplicate runs over int32 views, so the element count is doubled
// (EXERPT_TOKENS_KEY_VALUE == 2) to cover the int64 output buffer.
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyOutDefaultTokenCountOrCumsum()
{
    LocalTensor<int64_t> expertTokensOut = expertTokensCopyOutQueue_.AllocTensor<int64_t>();
    Duplicate(expertTokensOut.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
              static_cast<int32_t>(expertCountElements_ * EXERPT_TOKENS_KEY_VALUE));
    // Vector zero-fill must land in UB before the MTE3 copy-out reads it.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                 static_cast<uint32_t>(expertCountElements_ * sizeof(int64_t)), 0, 0, 0};
    DataCopyPad(expertTokensCountOrCumsumGm_, expertTokensOut, copyParams);
    expertTokensCopyOutQueue_.FreeTensor(expertTokensOut);
}
|
||||
|
||||
template <typename T>
// Writes the expandedRowIdx output to GM. Three paths:
//   - SCATTER: the dst->src mapping is written out directly.
//   - GATHER + expert parallelism (ep_): build src->dst on the scalar unit,
//     initialising to -1 and filling only entries routed to local experts.
//   - GATHER, no EP: the precomputed src->dst mapping from SortCompute is written.
// Both borrowed queue tensors are re-enqueued at the end for later stages.
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyOutIdx()
{
    LocalTensor<int32_t> expandedExpertIdx = expandedExpertIdxCopyOutQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.DeQue<int32_t>();
    if (rowIdxType_ == SCATTER) {
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(actual_idx_num_ * sizeof(int32_t)),
                                     0, 0, 0};
        DataCopyPad(expandedRowIdxGm_, expandDstToSrcRowLocal, copyParams);
    } else if (ep_) {
        LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<int32_t>();
        Duplicate(expandedRowIdx, static_cast<int32_t>(-1), static_cast<int32_t>(totalLength_));
        // Scalar GetValue/SetValue below read UB written by the vector unit.
        SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
        for (int64_t i = 0; i < actual_idx_num_; i++) {
            int32_t curExpertId = expandedExpertIdx.GetValue(i);
            // NOTE(review): `break` (not `continue`) assumes entries for experts
            // outside [expertStart_, expertEnd_) never precede local ones in the
            // sorted order — confirm against the sort/tiling contract.
            if (curExpertId < expertStart_ || curExpertId >= expertEnd_) {
                break;
            }
            int64_t outIndices = expandDstToSrcRowLocal.GetValue(i);
            expandedRowIdx.SetValue(outIndices, i);
        }
        // Scalar writes must complete before MTE3 copies the buffer out.
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0,
                                     0, 0};
        DataCopyPad(expandedRowIdxGm_, expandedRowIdx, copyParams);
        expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
    } else {
        LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.DeQue<int32_t>();
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0,
                                     0, 0};
        DataCopyPad(expandedRowIdxGm_, expandedRowIdx, copyParams);
        // Keep the tensor alive for the gather copy-out stage.
        expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx);
    }
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdx);
    expandDstToSrcRowQueue_.EnQue<int32_t>(expandDstToSrcRowLocal);
}
|
||||
|
||||
template <typename T>
// Run-length scan over the sorted expert-id list to produce the per-expert token
// statistics in one of three formats selected by expertTokensNumType_:
//   - EXERPT_TOKENS_KEY_VALUE: (expertId, count) pairs, densely packed;
//   - EXERPT_TOKENS_COUNT:     count per local expert slot;
//   - otherwise (cumsum):      running total per local expert slot, with gaps
//                              between observed experts filled with the prior total.
// Only experts in [expertStart_, expertEnd_) — this rank's slice — are recorded.
__aicore__ inline void MoeCustomFullLoadBase<T>::ComputeExpertTokenCountOrCumsum()
{
    // compute
    LocalTensor<int32_t> expandedExpertIdx = expandedExpertIdxCopyOutQueue_.DeQue<int32_t>();
    LocalTensor<int64_t> expertTokensOut = expertTokensCopyOutQueue_.AllocTensor<int64_t>();
    // Zero the whole int64 output via an int32 view (2x element count).
    Duplicate(expertTokensOut.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
              static_cast<int32_t>(expertCountElements_ * EXERPT_TOKENS_KEY_VALUE));
    // Scalar loop below reads UB written by the vector Duplicate.
    SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
    int64_t i = 0;
    int32_t lastExpertId = expandedExpertIdx.GetValue(0);  // id of the current run
    int32_t lastLastId = lastExpertId;                     // id of the previous run (for cumsum gap fill)
    int64_t tokenCount = 0;                                // running cumsum total
    int64_t lastIndex = 0;                                 // start index of the current run
    int64_t Offset = 0;                                    // next slot for key-value output
    for (i = 1; i < actual_idx_num_; i++) {
        // Stop once the current run has left this rank's expert range.
        if ((lastExpertId >= expertEnd_) || (lastExpertId < expertStart_)) {
            break;
        }
        int32_t curExpertId = expandedExpertIdx.GetValue(i);
        if (curExpertId != lastExpertId || curExpertId >= expertEnd_) {
            // Run boundary: flush the finished run [lastIndex, i) of lastExpertId.
            int64_t expertOffset = lastExpertId - expertStart_;
            if (expertTokensNumType_ == EXERPT_TOKENS_KEY_VALUE) {
                expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE, lastExpertId);
                expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE + 1, i - lastIndex);
                Offset += 1;
            } else if (expertTokensNumType_ == EXERPT_TOKENS_COUNT) {
                expertTokensOut.SetValue(expertOffset, i - lastIndex);
            } else {
                // Cumsum: experts between the previous run and this one saw no
                // tokens — they inherit the running total.
                for (int64_t j = lastLastId; j < lastExpertId; j++) {
                    expertTokensOut.SetValue(j - expertStart_, tokenCount);
                }
                tokenCount += i - lastIndex;
                expertTokensOut.SetValue(expertOffset, tokenCount);
            }
            lastIndex = i;
            lastLastId = lastExpertId;
            lastExpertId = curExpertId;
        }
    }
    if ((i == actual_idx_num_) && ((lastExpertId >= expertStart_) && (lastExpertId < expertEnd_))) {
        // Scanned to the end with a still-open local run: flush the final run,
        // and for cumsum also fill all trailing experts with the final total.
        int64_t expertOffset = lastExpertId - expertStart_;
        if (expertTokensNumType_ == EXERPT_TOKENS_KEY_VALUE) {
            expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE, lastExpertId);
            expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE + 1, i - lastIndex);
        } else if (expertTokensNumType_ == EXERPT_TOKENS_COUNT) {
            expertTokensOut.SetValue(expertOffset, i - lastIndex);
        } else {
            for (int64_t j = lastLastId; j < lastExpertId; j++) {
                expertTokensOut.SetValue(j - expertStart_, tokenCount);
            }
            tokenCount += i - lastIndex;
            expertTokensOut.SetValue(expertOffset, tokenCount);
            for (int64_t j = lastExpertId; j < expertEnd_; j++) {
                expertTokensOut.SetValue(j - expertStart_, tokenCount);
            }
        }
    } else {
        // Loop exited early (ran past the local expert range): for cumsum,
        // propagate the final total to the remaining local expert slots.
        if (expertTokensNumType_ == EXERPT_TOKENS_CUMSUM) {
            for (int64_t j = lastLastId; j < expertEnd_; j++) {
                expertTokensOut.SetValue(j - expertStart_, tokenCount);
            }
        }
    }
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdx);
    // Scalar writes must complete before MTE3 reads the output buffer.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                 static_cast<uint32_t>(expertCountElements_ * sizeof(int64_t)), 0, 0, 0};
    DataCopyPad(expertTokensCountOrCumsumGm_, expertTokensOut, copyParams);
    // Ensure the copy-out finished before the buffer can be reused.
    SetWaitFlag<HardEvent::MTE3_V>(HardEvent::MTE3_V);
    expertTokensCopyOutQueue_.FreeTensor(expertTokensOut);
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeCustomFullLoadBase<T>::TilingInKernel()
|
||||
{
|
||||
int64_t coreNum = needCoreNum_;
|
||||
perCoreIndicesElements_ = Ceil(actual_idx_num_, coreNum);
|
||||
needCoreNum_ = Ceil(actual_idx_num_, perCoreIndicesElements_);
|
||||
int64_t lastCoreIndicesElements = actual_idx_num_ - (needCoreNum_ - 1) * perCoreIndicesElements_;
|
||||
if (blockIdx_ == needCoreNum_ - 1) {
|
||||
coreIndicesElements_ = lastCoreIndicesElements;
|
||||
} else {
|
||||
coreIndicesElements_ = perCoreIndicesElements_;
|
||||
}
|
||||
curIndexStart_ = this->blockIdx_ * this->perCoreIndicesElements_;
|
||||
startXRow_ = curIndexStart_ / this->k_;
|
||||
endXRow_ = (curIndexStart_ + this->coreIndicesElements_ - 1) / this->k_;
|
||||
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_BASE_H
|
||||
@@ -0,0 +1,300 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_full_load_dynamic_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_DYNAMIC_QUANT_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_DYNAMIC_QUANT_H
|
||||
|
||||
#include "moe_custom_full_load_base.h"
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Full-load MoE routing kernel variant with per-token dynamic int8 quantization.
// Template parameters:
//   T           - input element type of x (float / half-like; cast to fp32 internally)
//   COPYOUTTYPE - selects the copy-out strategy (0 -> gather path when possible)
//   SMOOTHTYPE  - smooth-scale layout: NO_SCALE, SCALE_1H (one row), or SCALE_EH
//                 (one row per expert; forces the scatter copy-out path)
class MoeCustomFullLoadDynamicQuant : public MoeCustomFullLoadBase<T> {
public:
    __aicore__ inline MoeCustomFullLoadDynamicQuant(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expandedRowIdx,
                                GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Copy-out driven by source rows (writes each row to all its routed slots).
    __aicore__ inline void CopyOutXDynamicQuantFromGather();
    // Copy-out driven by destination slots (re-loads the source row per slot).
    __aicore__ inline void CopyOutXDynamicQuantFromScatter();
    // Drains and releases queue tensors still held at the end of Process().
    __aicore__ inline void FreeLocalTensor();
    // Per-row dynamic quantization: scale = max(|x|)/127, out = round(x/scale).
    __aicore__ inline void ComputeQuant(LocalTensor<float> &smoothLocal);

private:
    TQue<QuePosition::VECIN, 1> xCopyInQueue_;    // staged input row(s) of x
    TQue<QuePosition::VECIN, 1> smoothInQueue_;   // smooth-scale row
    TBuf<TPosition::VECCALC> tmpBuff_;            // fp32 scratch for quantization
    TQue<QuePosition::VECOUT, 1> inputXOutQueue_; // quantized int8 output row
    TQue<QuePosition::VECOUT, 1> scaleOutQueue_;  // per-row dynamic scale

    GlobalTensor<T> xGm_;                 // input activations
    GlobalTensor<int8_t> expandedXGm_;    // routed, quantized output
    GlobalTensor<float> quantSmoothGm_;   // smooth scales (layout per SMOOTHTYPE)
    GlobalTensor<float> expandedScaleGm_; // per-output-row dynamic scales

    int64_t colsAlign_ = 0; // cols_ aligned for the raw-T staging offset
};
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Binds global buffers and sizes the unified-buffer queues for one row of work.
// For non-fp32 T, the x staging buffer is doubled: the raw T row is loaded into
// the upper half (at colsAlign_) and cast to fp32 into the lower half.
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::Init(
    GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expandedRowIdx,
    GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    MoeCustomFullLoadBase<T>::Init(expertIdx, expandedRowIdx, expertTokensCountOrCumsum, workspace, tilingData, tPipe);

    xGm_.SetGlobalBuffer((__gm__ T *)x);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);
    quantSmoothGm_.SetGlobalBuffer((__gm__ float *)scale);
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);
    this->colsAlign_ = Align(this->cols_, sizeof(T));
    if constexpr (IsSameType<T, float>::value) {
        this->pipe_->InitBuffer(xCopyInQueue_, 1, AlignBytes(this->cols_, sizeof(float)));
    } else {
        // 2x: raw T row in the upper half + fp32 conversion in the lower half.
        this->pipe_->InitBuffer(xCopyInQueue_, 1, 2 * AlignBytes(this->cols_, sizeof(T)));
    }
    this->pipe_->InitBuffer(inputXOutQueue_, 1, AlignBytes(this->cols_, sizeof(int8_t)));
    this->pipe_->InitBuffer(smoothInQueue_, 1, AlignBytes(this->cols_, sizeof(float)));
    this->pipe_->InitBuffer(tmpBuff_, AlignBytes(this->cols_, sizeof(float)));
    this->pipe_->InitBuffer(scaleOutQueue_, 1, BLOCK_BYTES + BLOCK_BYTES);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Kernel entry for the dynamic-quant variant: routing sort, index/statistics
// copy-out, then the quantized data copy-out. Note that Compute() may update
// needCoreNum_ (in-kernel retiling), so it is re-checked after the call.
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::Process()
{
    if (this->blockIdx_ < this->needCoreNum_) {
        this->CopyIn();
        this->Compute();

        // Valid-expert count is zero: core 0 writes the default outputs, all exit.
        if (this->needCoreNum_ < 1) {
            if (this->blockIdx_ == 0) {
                if (this->rowIdxType_ == GATHER) {
                    this->CopyOutDefaultGatherIdx();
                }
                if (this->expertTokensNumFlag_ == 1) {
                    this->CopyOutDefaultTokenCountOrCumsum();
                }
            }
            return;
        }

        // Core 0 writes the row-index mapping output.
        if (this->blockIdx_ == 0) {
            this->CopyOutIdx();
        }

        // The last active core writes the per-expert token statistics.
        if (this->blockIdx_ == this->needCoreNum_ - 1 && this->expertTokensNumFlag_ == 1) {
            this->ComputeExpertTokenCountOrCumsum();
        }

        // Re-check against the possibly reduced needCoreNum_ before copy-out.
        if (this->blockIdx_ < this->needCoreNum_) {
            // Per-expert smooth scales (SCALE_EH) require the scatter path so the
            // right scale row is used for each destination slot.
            if constexpr (!COPYOUTTYPE && SMOOTHTYPE != SCALE_EH) {
                CopyOutXDynamicQuantFromGather();
            } else {
                CopyOutXDynamicQuantFromScatter();
            }
        }

        FreeLocalTensor();
    }
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Dynamically quantizes one row: optionally multiplies by the smooth scale, then
// scale = max(|row|) / 127 and out = round(row / scale) cast down to int8 via the
// int32 -> half -> int8 conversion chain. Enqueues the int8 row and its scale.
__aicore__ inline void
MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::ComputeQuant(LocalTensor<float> &smoothLocal)
{
    LocalTensor<float> tempLocal = tmpBuff_.Get<float>();
    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();
    LocalTensor<float> dynamicQuantLocal = scaleOutQueue_.AllocTensor<float>();
    LocalTensor<float> inLocal = xCopyInQueue_.DeQue<float>();

    // Non-fp32 input was staged at offset colsAlign_; convert it to fp32 in place.
    if constexpr (!IsSameType<T, float>::value && !IsSameType<T, int8_t>::value) {
        Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign_], RoundMode::CAST_NONE, this->cols_);
        PipeBarrier<PIPE_V>();
    }

    if constexpr (SMOOTHTYPE != NO_SCALE) {
        Mul(inLocal, inLocal, smoothLocal, this->cols_);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, this->cols_);
    PipeBarrier<PIPE_V>();

    ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
    PipeBarrier<PIPE_V>();

    // NOTE(review): scalar GetValue reads UB written by the vector ReduceMax; a
    // V->S event (not only PipeBarrier<PIPE_V>) is usually needed here — confirm.
    float maxValue = dynamicQuantLocal.GetValue(0) / MAX_INT8;

    // Broadcast the scale: one block for the scale output, a full row as divisor.
    Duplicate<float>(dynamicQuantLocal, maxValue, INT32_ONE_BLOCK_NUM);
    PipeBarrier<PIPE_V>();
    Duplicate<float>(tempLocal, maxValue, this->cols_);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, this->cols_);
    PipeBarrier<PIPE_V>();

    // fp32 -> int32 (round-to-nearest-even) -> half -> int8 conversion chain.
    LocalTensor<int32_t> intLocal = tempLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, tempLocal, RoundMode::CAST_RINT, this->cols_);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(intLocal.ReinterpretCast<half>(), intLocal, RoundMode::CAST_ROUND, this->cols_);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, intLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, this->cols_);

    inputXOutQueue_.EnQue<int8_t>(outLocal);
    scaleOutQueue_.EnQue<float>(dynamicQuantLocal);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
|
||||
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::CopyOutXDynamicQuantFromScatter()
|
||||
{
|
||||
LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
|
||||
LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
|
||||
|
||||
DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
|
||||
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
|
||||
DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
|
||||
DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
|
||||
LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();
|
||||
;
|
||||
|
||||
if constexpr (SMOOTHTYPE == SCALE_1H) {
|
||||
DataCopyPad(smoothLocal, quantSmoothGm_, smoothCopyParams, {false, 0, 0, 0});
|
||||
smoothInQueue_.EnQue(smoothLocal);
|
||||
smoothLocal = smoothInQueue_.DeQue<float>();
|
||||
}
|
||||
|
||||
int64_t dstIndexStart = this->curIndexStart_;
|
||||
int64_t dstIndexEnd = dstIndexStart + this->coreIndicesElements_ - 1;
|
||||
int32_t lastExpertIdx = -1;
|
||||
|
||||
for (int64_t dstIndex = dstIndexStart; dstIndex <= dstIndexEnd; dstIndex++) {
|
||||
if (this->dropPadMode_ == DROPLESS_MODE && dstIndex >= this->activeNum_) {
|
||||
break;
|
||||
}
|
||||
int32_t srcIdx = sortedRowIdx.GetValue(dstIndex);
|
||||
int32_t expertIdx = expandedExpertIdx.GetValue(dstIndex);
|
||||
if (expertIdx < this->expertStart_ || expertIdx >= this->expertEnd_) {
|
||||
break;
|
||||
}
|
||||
expertIdx = expertIdx - this->expertStart_;
|
||||
LocalTensor<T> xLocal = this->xCopyInQueue_.template AllocTensor<T>();
|
||||
// copy in single x
|
||||
if constexpr (IsSameType<T, float>::value) {
|
||||
DataCopyPad(xLocal, this->xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
|
||||
} else {
|
||||
DataCopyPad(xLocal[colsAlign_], this->xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams,
|
||||
{false, 0, 0, 0});
|
||||
}
|
||||
xCopyInQueue_.EnQue<T>(xLocal);
|
||||
|
||||
// copyin dynamic scale
|
||||
if constexpr (SMOOTHTYPE == SCALE_EH) {
|
||||
if (expertIdx != lastExpertIdx) {
|
||||
DataCopyPad(smoothLocal, quantSmoothGm_[expertIdx * this->cols_], smoothCopyParams, {false, 0, 0, 0});
|
||||
smoothInQueue_.EnQue(smoothLocal);
|
||||
smoothLocal = smoothInQueue_.DeQue<float>();
|
||||
lastExpertIdx = expertIdx;
|
||||
}
|
||||
}
|
||||
|
||||
ComputeQuant(smoothLocal);
|
||||
|
||||
LocalTensor<float> quantScaleLocal = scaleOutQueue_.DeQue<float>();
|
||||
DataCopyPad(expandedScaleGm_[dstIndex], quantScaleLocal, quantScaleParams);
|
||||
|
||||
LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
|
||||
DataCopyPad(this->expandedXGm_[dstIndex * this->cols_], outLocal, intriParams);
|
||||
|
||||
inputXOutQueue_.FreeTensor(outLocal);
|
||||
scaleOutQueue_.FreeTensor(quantScaleLocal);
|
||||
this->xCopyInQueue_.FreeTensor(xLocal);
|
||||
}
|
||||
smoothInQueue_.FreeTensor(smoothLocal);
|
||||
this->expandDstToSrcRowQueue_.EnQue(sortedRowIdx);
|
||||
this->expandedExpertIdxCopyOutQueue_.EnQue(expandedExpertIdx);
|
||||
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Gather-style copy-out with dynamic quantization: loads each source row once,
// quantizes it once, then writes the same int8 row and scale to every destination
// slot it was routed to (via expandedRowIdx). More efficient than the scatter
// path when k_ > 1 because each row is quantized a single time.
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::CopyOutXDynamicQuantFromGather()
{
    DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
    DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
    LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();
    int64_t curIndex = this->blockIdx_ * this->perCoreIndicesElements_;
    int64_t curIndexEnd = curIndex + this->coreIndicesElements_ - 1;

    // One shared smooth row (SCALE_1H): load it once before the row loop.
    if constexpr (SMOOTHTYPE == SCALE_1H) {
        DataCopyPad(smoothLocal, quantSmoothGm_, smoothCopyParams, {false, 0, 0, 0});
        smoothInQueue_.EnQue(smoothLocal);
        smoothLocal = smoothInQueue_.DeQue<float>();
    }

    for (int64_t row = this->startXRow_; row <= this->endXRow_; row++) {
        LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
        // Non-fp32 T is staged at colsAlign_ for the in-place fp32 cast in ComputeQuant.
        if constexpr (IsSameType<T, float>::value) {
            DataCopyPad(xLocal, this->xGm_[row * this->cols_], dataXCopyParams, {false, 0, 0, 0});
        } else {
            DataCopyPad(xLocal[colsAlign_], this->xGm_[row * this->cols_], dataXCopyParams, {false, 0, 0, 0});
        }
        xCopyInQueue_.EnQue<T>(xLocal);
        ComputeQuant(smoothLocal);

        LocalTensor<float> quantScaleLocal = scaleOutQueue_.DeQue<float>();
        LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
        // Emit this row to every destination slot whose flat index maps to it
        // (k_ consecutive routing entries per source row).
        while (curIndex <= curIndexEnd && curIndex / this->k_ == row) {
            int32_t outIndex = expandedRowIdx.GetValue(curIndex);
            curIndex++;
            // Skip unrouted slots (-1) and, in dropless mode, slots past activeNum_.
            // (&& binds tighter than ||, i.e. skip when outIndex == -1, or when
            // dropless AND outIndex >= activeNum_ — matching the scatter path.)
            if (outIndex == -1 || this->dropPadMode_ == DROPLESS_MODE && outIndex >= this->activeNum_) {
                continue;
            }
            DataCopyPad(expandedXGm_[outIndex * this->cols_], outLocal, intriParams);
            DataCopyPad(expandedScaleGm_[outIndex], quantScaleLocal, quantScaleParams);
        }

        xCopyInQueue_.FreeTensor(xLocal);
        inputXOutQueue_.FreeTensor(outLocal);
        scaleOutQueue_.FreeTensor(quantScaleLocal);
    }

    smoothInQueue_.FreeTensor(smoothLocal);
    // Hand the index tensor back for FreeLocalTensor().
    this->expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
|
||||
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::FreeLocalTensor()
|
||||
{
|
||||
if constexpr (!COPYOUTTYPE) {
|
||||
LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
|
||||
this->expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
|
||||
}
|
||||
LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
|
||||
LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
|
||||
this->expandDstToSrcRowQueue_.FreeTensor(sortedRowIdx);
|
||||
this->expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdx);
|
||||
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_DYNAMIC_QUANT_H
|
||||
@@ -0,0 +1,229 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_static_quant_full_load.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_STATIC_QUANT_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_STATIC_QUANT_H
|
||||
|
||||
#include "moe_custom_full_load_base.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
template <typename T>
// Full-load MoE routing kernel variant with static int8 quantization:
// out = round(x * scale_ + offset_), using one global scale and offset read
// from GM at Init time.
class MoeCustomFullLoadStaticQuant : public MoeCustomFullLoadBase<T> {
public:
    __aicore__ inline MoeCustomFullLoadStaticQuant(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedX,
                                GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Copies routed rows to GM after static quantization.
    __aicore__ inline void CopyOutXStaticQuant();
    // Drains and releases queue tensors still held at the end of Process().
    __aicore__ inline void FreeLocalTensor();
    // Statically quantizes xLocalLength staged rows to int8.
    __aicore__ inline void ComputeQuant(int64_t xLocalLength);

private:
    TQue<QuePosition::VECIN, 1> xCopyInQueue_;    // staged input row(s) of x
    TQue<QuePosition::VECOUT, 1> floatQueue_;     // fp32 scratch (non-fp32 T only)
    TQue<QuePosition::VECOUT, 1> halfQueue_;      // half scratch for the int8 cast chain
    TQue<QuePosition::VECOUT, 1> inputXOutQueue_; // quantized int8 output

    GlobalTensor<T> xGm_;              // input activations
    GlobalTensor<int8_t> expandedXGm_; // routed, quantized output
    GlobalTensor<float> scaleGm_;      // single static scale
    GlobalTensor<float> offsetGm_;     // single static offset

    float scale_;  // cached from scaleGm_[0] at Init
    float offset_; // cached from offsetGm_[0] at Init
};
|
||||
|
||||
template <typename T>
// Binds global buffers, caches the static scale/offset from GM, and sizes the
// unified-buffer queues. Buffer capacity covers `rowLength` rows: one row in EP
// mode (rows are re-loaded per destination slot), otherwise all source rows this
// core's index slice can touch.
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset,
                                                             GM_ADDR expandedX, GM_ADDR expandedRowIdx,
                                                             GM_ADDR expertTokensCountOrCumsum, GM_ADDR workspace,
                                                             const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    MoeCustomFullLoadBase<T>::Init(expertIdx, expandedRowIdx, expertTokensCountOrCumsum, workspace, tilingData, tPipe);

    xGm_.SetGlobalBuffer((__gm__ T *)x);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale, 1);
    offsetGm_.SetGlobalBuffer((__gm__ float *)offset, 1);
    // Scalar reads of the quant parameters straight from GM.
    this->scale_ = scaleGm_.GetValue(0);
    this->offset_ = offsetGm_.GetValue(0);
    // Ensure the scalar reads are visible before vector work uses scale_/offset_.
    SetWaitFlag<HardEvent::S_V>(HardEvent::S_V);
    int64_t curIndexStart = this->blockIdx_ * this->perCoreIndicesElements_;
    int64_t rowLength = 0;
    if (this->ep_) {
        rowLength = 1;
    } else {
        // Number of distinct source rows covered by this core's index slice.
        rowLength = (curIndexStart + this->coreIndicesElements_ - 1) / this->k_ - curIndexStart / this->k_ + 1;
    }
    int64_t xAlignedCount = Align(this->cols_, sizeof(int8_t));
    this->pipe_->InitBuffer(xCopyInQueue_, this->bufferNum_, xAlignedCount * sizeof(T) * rowLength);
    this->pipe_->InitBuffer(inputXOutQueue_, 1, xAlignedCount * sizeof(int8_t) * rowLength);
    this->pipe_->InitBuffer(floatQueue_, 1, xAlignedCount * sizeof(float) * rowLength);
    this->pipe_->InitBuffer(halfQueue_, 1, xAlignedCount * sizeof(half) * rowLength);
}
|
||||
|
||||
template <typename T>
// Kernel entry for the static-quant variant: routing sort, index/statistics
// copy-out, then the quantized data copy-out. Compute() may update needCoreNum_
// (in-kernel retiling), so it is re-checked after the call.
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::Process()
{
    if (this->blockIdx_ < this->needCoreNum_) {
        this->CopyIn();
        this->Compute();

        // Valid-expert count is zero: core 0 writes the default outputs, all exit.
        if (this->needCoreNum_ < 1) {
            if (this->blockIdx_ == 0) {
                if (this->rowIdxType_ == GATHER) {
                    this->CopyOutDefaultGatherIdx();
                }
                if (this->expertTokensNumFlag_ == 1) {
                    this->CopyOutDefaultTokenCountOrCumsum();
                }
            }
            return;
        }

        // Core 0 writes the row-index mapping output.
        if (this->blockIdx_ == 0) {
            this->CopyOutIdx();
        }
        // The last active core writes the per-expert token statistics.
        if (this->blockIdx_ == this->needCoreNum_ - 1 && this->expertTokensNumFlag_ == 1) {
            this->ComputeExpertTokenCountOrCumsum();
        }
        // Re-check against the possibly reduced needCoreNum_ before copy-out.
        if (this->blockIdx_ < this->needCoreNum_) {
            CopyOutXStaticQuant();
        }
        FreeLocalTensor();
    }
}
|
||||
|
||||
template <typename T>
// Statically quantizes xLocalLength staged rows: out = x * scale_ + offset_,
// rounded and cast to int8 via the fp32 -> int32 -> half -> int8 conversion chain.
// Dequeues the staged input, enqueues the int8 result, and frees all scratch.
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::ComputeQuant(int64_t xLocalLength)
{
    LocalTensor<float> floatLocal;
    LocalTensor<T> inLocal;
    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();
    LocalTensor<half> halfLocal = halfQueue_.AllocTensor<half>();
    // Per-row element count is padded to the int8 alignment used at Init.
    uint64_t elements = Align(this->cols_, sizeof(int8_t)) * xLocalLength;
    if constexpr (IsSameType<T, float>::value) {
        // fp32 input can be transformed in place; no separate fp32 scratch needed.
        floatLocal = this->xCopyInQueue_.template DeQue<float>();
    } else {
        inLocal = this->xCopyInQueue_.template DeQue<T>();
        floatLocal = floatQueue_.AllocTensor<float>();
        Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
        PipeBarrier<PIPE_V>();
    }
    Muls(floatLocal, floatLocal, this->scale_, elements);
    PipeBarrier<PIPE_V>();
    Adds(floatLocal, floatLocal, this->offset_, elements);
    PipeBarrier<PIPE_V>();
    // fp32 -> int32 (round-to-nearest-even) -> half -> int8 conversion chain.
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, floatLocal, RoundMode::CAST_RINT, elements);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(halfLocal, intLocal, RoundMode::CAST_ROUND, elements);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_TRUNC, elements);
    inputXOutQueue_.EnQue(outLocal);
    if constexpr (IsSameType<T, float>::value) {
        this->xCopyInQueue_.FreeTensor(floatLocal);
    } else {
        this->xCopyInQueue_.FreeTensor(inLocal);
        floatQueue_.FreeTensor(floatLocal);
    }

    halfQueue_.FreeTensor(halfLocal);
}
|
||||
|
||||
// Gathers this core's slice of input rows, statically quantizes each, and
// scatters the int8 rows into expandedX.
// Two layouts:
//  - ep_ path: iterate destination indices; fetch the source row per index
//    (srcIdx / k_ maps an expanded index back to its token row), skipping rows
//    past activeNum_ (dropless) or whose expert is outside [expertStart_, expertEnd_).
//  - non-ep path: bulk-load the contiguous row range [startXRow_, endXRow_],
//    quantize all rows at once, then scatter by expandedRowIdx lookups.
template <typename T>
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::CopyOutXStaticQuant()
{
    int64_t curIndex = this->curIndexStart_;
    int64_t curIndexEnd = curIndex + this->coreIndicesElements_ - 1;

    if (this->ep_) {
        LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
        LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();

        DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};

        for (int64_t dstIndex = curIndex; dstIndex <= curIndexEnd; dstIndex++) {
            if (this->dropPadMode_ == DROPLESS_MODE && dstIndex >= this->activeNum_) {
                break;
            }
            int32_t srcIdx = sortedRowIdx.GetValue(dstIndex);
            int32_t expertIdx = expandedExpertIdx.GetValue(dstIndex);
            // Indices are expert-sorted, so the first out-of-range expert ends this core's work.
            if (expertIdx < this->expertStart_ || expertIdx >= this->expertEnd_) {
                break;
            }
            LocalTensor<T> inLocal = this->xCopyInQueue_.template AllocTensor<T>();
            // Copy one source row of x in.
            DataCopyPad(inLocal, this->xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
            this->xCopyInQueue_.template EnQue<T>(inLocal);
            ComputeQuant(1);

            LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
            DataCopyPad(this->expandedXGm_[dstIndex * this->cols_], outLocal, intriParams);
            inputXOutQueue_.FreeTensor(outLocal);
        }
        // Hand the index tensors back so FreeLocalTensor can release them later.
        this->expandDstToSrcRowQueue_.EnQue(sortedRowIdx);
        this->expandedExpertIdxCopyOutQueue_.EnQue(expandedExpertIdx);
    } else {
        LocalTensor<T> xLocal = this->xCopyInQueue_.template AllocTensor<T>();
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        // Per-row stride of the local buffer, chosen to match the int8 output layout.
        int64_t inFactor = Align(this->cols_, sizeof(int8_t));
        uint32_t dstStride = (inFactor * sizeof(T) - AlignBytes(this->cols_, sizeof(T))) / BLOCK_BYTES;
        DataCopyExtParams dataXCopyParams{static_cast<uint16_t>(this->endXRow_ - this->startXRow_ + 1),
                                          static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, dstStride, 0};
        DataCopyPad(xLocal, this->xGm_[this->startXRow_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
        this->xCopyInQueue_.EnQue(xLocal);
        // Ensure the bulk GM->UB copy lands before the vector quantization reads it.
        SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
        ComputeQuant(this->endXRow_ - this->startXRow_ + 1);

        LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
        int64_t k = 0;
        DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
        for (int64_t i = this->startXRow_; i <= this->endXRow_; i++) {
            // Each source row i expands to up to k_ destination slots; walk indices while they map to row i.
            for (; k < this->coreIndicesElements_ && curIndex / this->k_ == i; curIndex++, k++) {
                int32_t outIndex = expandedRowIdx.GetValue(curIndex);
                if (outIndex < this->activeNum_) {
                    DataCopyPad(this->expandedXGm_[outIndex * this->cols_], outLocal[(i - this->startXRow_) * inFactor],
                                intriParams);
                }
            }
        }
        inputXOutQueue_.FreeTensor(outLocal);
        this->expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx);
    }
}
|
||||
|
||||
// Drains and frees the index tensors that earlier phases re-enqueued, so the
// queues end the kernel balanced. The expandedRowIdx queue is only populated
// on the non-ep path, hence the guard.
template <typename T>
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::FreeLocalTensor()
{
    if (!this->ep_) {
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        this->expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
    }
    LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
    this->expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdx);
    LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
    this->expandDstToSrcRowQueue_.FreeTensor(sortedRowIdx);
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_STATIC_QUANT_H
|
||||
@@ -0,0 +1,224 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_full_load_unquantized.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_UNQUANTIZED_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_UNQUANTIZED_H
|
||||
|
||||
#include "moe_custom_full_load_base.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Full-load MoE routing kernel variant without quantization: routing indices
// are computed by MoeCustomFullLoadBase, and this class gathers/scatters the
// raw rows of x (and optionally a per-row float scale) into expandedX /
// expandedScale in the routed order.
template <typename T>
class MoeCustomFullLoadUnquantized : public MoeCustomFullLoadBase<T> {
public:
    __aicore__ inline MoeCustomFullLoadUnquantized(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expandedRowIdx,
                                GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

protected:
    __aicore__ inline void FreeLocalTensor();  // releases index tensors held across phases
    __aicore__ inline void GatherOutX();       // copies x rows to expandedX in routed order
    __aicore__ inline void CopyOutScale();     // copies per-row scale alongside x (optional)

protected:
    TQue<QuePosition::VECIN, 1> xCopyInQueue_;      // staging for x rows (GM -> UB -> GM)
    TQue<QuePosition::VECIN, 1> scaleCopyInQueue_;  // staging for one scale value at a time

    GlobalTensor<T> xGm_;                  // input activations [n, cols]
    GlobalTensor<float> scaleGm_;          // optional per-row scale input
    GlobalTensor<T> expandedXGm_;          // routed output rows
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> expandedScaleGm_;  // routed per-row scale output
};
|
||||
|
||||
// Binds GM buffers and sizes the UB staging queues. The base-class Init handles
// everything related to routing indices; this override only adds the x / scale
// buffers. On the ep path one row is staged at a time; otherwise the whole
// contiguous row range this core owns (row_length rows) is staged at once.
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX,
                                                             GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                                             GM_ADDR expandedScale, GM_ADDR workspace,
                                                             const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    MoeCustomFullLoadBase<T>::Init(expertIdx, expandedRowIdx, expertTokensCountOrCumsum, workspace, tilingData, tPipe);
    xGm_.SetGlobalBuffer((__gm__ T *)x);
    if (this->isInputScale_) {
        scaleGm_.SetGlobalBuffer((__gm__ float *)scale);
        expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);
    }

    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX);
    // NOTE(review): buffSize is computed but never used in this function — dead local?
    int64_t buffSize = this->sortNum_ * sizeof(int32_t);
    // Number of distinct source rows covered by this core's expanded-index range
    // (each source row contributes k_ consecutive expanded indices).
    int64_t row_length =
        (this->curIndexStart_ + this->coreIndicesElements_ - 1) / this->k_ - this->curIndexStart_ / this->k_ + 1;

    if (this->ep_) {
        this->pipe_->InitBuffer(xCopyInQueue_, this->bufferNum_, AlignBytes(this->cols_, sizeof(T)));
    } else {
        this->pipe_->InitBuffer(xCopyInQueue_, this->bufferNum_, AlignBytes(this->cols_, sizeof(T)) * row_length);
    }
    this->pipe_->InitBuffer(scaleCopyInQueue_, 1, AlignBytes(1, sizeof(float)));
}
|
||||
|
||||
// Top-level per-core driver: build routing indices (CopyIn/Compute from the
// base class), then divide the output phases among cores — core 0 writes the
// index outputs, the last core writes the per-expert token counts/cumsum, and
// every active core gathers its share of x rows (plus scales when present).
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::Process()
{
    if (this->blockIdx_ < this->needCoreNum_) {
        this->CopyIn();
        this->Compute();

        // Valid-expert count is zero: emit default outputs and bail out early.
        // NOTE(review): blockIdx_ >= 0, so needCoreNum_ < 1 can never hold inside
        // the enclosing `blockIdx_ < needCoreNum_` check — this branch looks
        // unreachable; confirm whether it was meant to sit outside the outer if.
        if (this->needCoreNum_ < 1) {
            if (this->blockIdx_ == 0) {
                if (this->rowIdxType_ == GATHER) {
                    this->CopyOutDefaultGatherIdx();
                }
                if (this->expertTokensNumFlag_ == 1) {
                    this->CopyOutDefaultTokenCountOrCumsum();
                }
            }
            return;
        }

        // Core 0 owns the routing-index outputs.
        if (this->blockIdx_ == 0) {
            this->CopyOutIdx();
        }

        // Last active core owns the per-expert token count / cumsum output.
        if (this->blockIdx_ == this->needCoreNum_ - 1 && this->expertTokensNumFlag_ == 1) {
            this->ComputeExpertTokenCountOrCumsum();
        }

        if (this->blockIdx_ < this->needCoreNum_) {
            this->GatherOutX();
            if (this->isInputScale_) {
                this->CopyOutScale();
            }
        }

        this->FreeLocalTensor();
    }
}
|
||||
|
||||
// Copies x rows into expandedX in routed order, without quantization.
//  - ep_ path: per destination row, look up the source row via
//    expandDstToSrcRow (rowIdx / k_), bounce one row through UB, stopping at
//    activeNum_ or the first expert outside [expertStart_, expertEnd_).
//  - non-ep path: bulk-load this core's contiguous source rows once, then
//    scatter each row to the destinations named by expandedRowIdx.
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::GatherOutX()
{
    if (this->ep_) {
        LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
        LocalTensor<int32_t> expandDstToSrcRowLocal = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
        int64_t startRowIdx = this->blockIdx_ * this->perCoreIndicesElements_;
        int64_t endRowIdx = startRowIdx + this->coreIndicesElements_;
        LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPadExtParams<T> padParams{false, 0, 0, 0};
        for (int64_t i = startRowIdx; i < endRowIdx && i < this->activeNum_; i++) {
            int32_t curExpertId = expandedExpertIdx.GetValue(i);
            // Expert-sorted order: first out-of-range expert ends this core's slice.
            if (curExpertId < this->expertStart_ || curExpertId >= this->expertEnd_) {
                break;
            }
            int64_t rowIdx = expandDstToSrcRowLocal.GetValue(i);
            int64_t srcOffset = rowIdx / this->k_ * this->cols_;
            int64_t dstOffset = i * this->cols_;
            // Same UB buffer is reused every iteration, so serialize the
            // out-copy of the previous row against the next in-copy, and the
            // in-copy against the out-copy.
            SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
            DataCopyPad(xLocal, xGm_[srcOffset], copyParams, padParams);
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            DataCopyPad(expandedXGm_[dstOffset], xLocal, copyParams);
        }
        xCopyInQueue_.FreeTensor(xLocal);
        // Re-enqueue so CopyOutScale / FreeLocalTensor can dequeue them again.
        this->expandedExpertIdxCopyOutQueue_.template EnQue<int32_t>(expandedExpertIdx);
        this->expandDstToSrcRowQueue_.template EnQue<int32_t>(expandDstToSrcRowLocal);
    } else {
        LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
        DataCopyExtParams dataXCopyParams{static_cast<uint16_t>(this->endXRow_ - this->startXRow_ + 1),
                                          static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPadExtParams<T> dataXCopyPadParams{false, 0, 0, 0};
        DataCopyPad(xLocal, xGm_[this->startXRow_ * this->cols_], dataXCopyParams, dataXCopyPadParams);
        // Bulk load must complete before the scatter copies read the buffer.
        SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
        int64_t inFactor = Align(this->cols_, sizeof(T));  // per-row stride in the local buffer
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        int64_t curIndexStart = this->curIndexStart_;
        int64_t k = 0;
        for (int64_t i = this->startXRow_; i <= this->endXRow_; i++) {
            // Each source row i serves the expanded indices whose index / k_ == i.
            for (; k < this->coreIndicesElements_ && curIndexStart / this->k_ == i; curIndexStart++, k++) {
                int32_t outIndex = expandedRowIdx.GetValue(curIndexStart);
                if (outIndex < this->activeNum_) {
                    DataCopyPad(expandedXGm_[outIndex * this->cols_], xLocal[(i - this->startXRow_) * inFactor],
                                copyParams);
                }
            }
        }
        xCopyInQueue_.FreeTensor(xLocal);
        this->expandedRowIdxCopyOutQueue_.template EnQue<int32_t>(expandedRowIdx);
    }
}
|
||||
|
||||
// Drains and frees the index tensors that GatherOutX / CopyOutScale re-enqueued,
// leaving all queues balanced at kernel end. The expandedRowIdx queue only holds
// a tensor on the non-ep path.
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::FreeLocalTensor()
{
    LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
    LocalTensor<int32_t> expandDstToSrcRowLocal = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
    this->expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdx);
    this->expandDstToSrcRowQueue_.FreeTensor(expandDstToSrcRowLocal);
    if (!this->ep_) {
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        this->expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
    }
}
|
||||
|
||||
// Routes the optional per-row float scale alongside x: one scalar is bounced
// through UB per source row and written to every destination slot that row maps
// to. Mirrors GatherOutX's two traversal orders (ep: destination-driven lookup;
// non-ep: source-row sweep scattered via expandedRowIdx).
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::CopyOutScale()
{
    LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
    DataCopyPadExtParams<float> padParams{false, 0, 0, 0};
    if (this->ep_) {
        LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
        LocalTensor<int32_t> expandDstToSrcRowLocal = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
        int64_t startRowIdx = this->blockIdx_ * this->perCoreIndicesElements_;
        int64_t endRowIdx = startRowIdx + this->coreIndicesElements_;
        for (int64_t i = startRowIdx; i < endRowIdx && i < this->activeNum_; i++) {
            int32_t curExpertId = expandedExpertIdx.GetValue(i);
            // Same early-exit rule as GatherOutX: stop at the first foreign expert.
            if (curExpertId < this->expertStart_ || curExpertId >= this->expertEnd_) {
                break;
            }
            int64_t rowIdx = expandDstToSrcRowLocal.GetValue(i);
            // Single shared UB scalar: serialize out-copy vs. next in-copy and
            // in-copy vs. out-copy with hardware events.
            SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
            DataCopyPad(scaleLocal, scaleGm_[rowIdx / this->k_], copyParams, padParams);
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            DataCopyPad(expandedScaleGm_[i], scaleLocal, copyParams);
        }
        this->expandedExpertIdxCopyOutQueue_.template EnQue<int32_t>(expandedExpertIdx);
        this->expandDstToSrcRowQueue_.template EnQue<int32_t>(expandDstToSrcRowLocal);
    } else {
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        int64_t curIndexStart = this->curIndexStart_;
        int64_t k = 0;
        for (int64_t i = this->startXRow_; i <= this->endXRow_; i++) {
            SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
            DataCopyPad(scaleLocal, scaleGm_[i], copyParams, padParams);
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            // Write row i's scale to every expanded slot mapped to row i.
            for (; k < this->coreIndicesElements_ && curIndexStart / this->k_ == i; curIndexStart++, k++) {
                int32_t outIndex = expandedRowIdx.GetValue(curIndexStart);
                if (outIndex < this->activeNum_) {
                    DataCopyPad(expandedScaleGm_[outIndex], scaleLocal, copyParams);
                }
            }
        }
        this->expandedRowIdxCopyOutQueue_.template EnQue<int32_t>(expandedRowIdx);
    }
    scaleCopyInQueue_.FreeTensor(scaleLocal);
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_UNQUANTIZED_H
|
||||
@@ -0,0 +1,238 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_droppad_static_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_DROPPAD_STATIC_QUANT_H
|
||||
#define MOE_CUSTOM_GATHER_DROPPAD_STATIC_QUANT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Double-buffering depth for the copy-in/copy-out queues of this kernel.
constexpr int64_t GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM = 2;

// Gather/drop-pad stage with static int8 quantization: reads rows of inputX
// addressed through expandedRowIdx, quantizes them with a single scalar
// scale/offset (loaded from GM at Init), and writes int8 rows to expandedX.
// Rows are tiled both by row (perLoopRows_/lastLoopRows_) and by column
// (perLoopCols_/lastLoopCols_) per the tiling data. An expandedRowIdx value of
// -1 marks a dropped row.
template <typename T>
class MoeGatherDroppadQuant {
public:
    __aicore__ inline MoeGatherDroppadQuant(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedRowIdx,
                                GM_ADDR expandedX, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyExpertIn(int64_t progress);              // loads one tile of expandedRowIdx
    __aicore__ inline void Compute();                                   // quantizes the staged column tile
    __aicore__ inline void CopyXIn(int64_t xSrcOffset, int64_t curLoopCols);  // loads one column tile of a row
    __aicore__ inline void CopyOut(int64_t progress);                   // scatters quantized tiles to expandedX

private:
    TPipe *pipe_;
    TQue<QuePosition::VECIN, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM> inputXCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM> expandRowIdxCopyInQueue_;
    TQue<QuePosition::VECOUT, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM> inputXCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> floatQueue_;  // float scratch for the cast chain
    TQue<QuePosition::VECOUT, 1> halfQueue_;   // half scratch for the cast chain

    GlobalTensor<T> inputXGm_;
    GlobalTensor<int8_t> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> scaleGm_;   // single scalar quant scale
    GlobalTensor<float> offsetGm_;  // single scalar quant offset

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Tiling-derived per-core loop bounds (set in Init).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t currentLoopRows_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    float scale_;   // cached from scaleGm_[0]
    float offset_;  // cached from offsetGm_[0]

    int64_t indicesOffset_;
    int64_t inputOffset_;
    int64_t outOffset_;
};
|
||||
|
||||
// Loads the tile of expandedRowIdx for loop iteration `progress`
// (currentLoopRows_ entries starting at progress * perLoopRows_) into UB.
// Note: expandedRowIdxGm_ was offset to this core's slice in Init.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::CopyExpertIn(int64_t progress)
{
    indicesOffset_ = progress * perLoopRows_;
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams);
    expandRowIdxCopyInQueue_.EnQue<int32_t>(indicesLocal);
}
|
||||
|
||||
// Loads one column tile (curLoopCols elements starting at GM offset xSrcOffset)
// of an inputX row into UB and enqueues it for Compute.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::CopyXIn(int64_t xSrcOffset, int64_t curLoopCols)
{
    LocalTensor<T> inLocal = inputXCopyInQueue_.AllocTensor<T>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
    DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal, inputXGm_[xSrcOffset], dataCopyParams, dataCopyPadParams);
    inputXCopyInQueue_.EnQue(inLocal);
}
|
||||
|
||||
// Statically quantizes the currently staged column tile to int8:
// out = trunc(round(rint(x * scale_ + offset_))) via the
// float -> int32 -> half -> int8 hardware cast chain (the int32 stage reuses
// the float buffer through ReinterpretCast). Consumes one tensor from
// inputXCopyInQueue_ and produces one on inputXCopyOutQueue_.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::Compute()
{
    LocalTensor<float> floatLocal;
    LocalTensor<T> inLocal;
    LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.AllocTensor<int8_t>();
    LocalTensor<half> halfLocal = halfQueue_.AllocTensor<half>();
    // NOTE(review): aligned by sizeof(T) here, unlike the full-load variant
    // which aligns by sizeof(int8_t); and uint32_t vs uint64_t — confirm intent.
    uint32_t elements = Align(colsTileLength_, sizeof(T));
    if constexpr (IsSameType<T, float>::value) {
        floatLocal = inputXCopyInQueue_.DeQue<float>();
    } else {
        inLocal = inputXCopyInQueue_.DeQue<T>();
        floatLocal = floatQueue_.AllocTensor<float>();
        Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
        PipeBarrier<PIPE_V>();
    }
    Muls(floatLocal, floatLocal, scale_, elements);
    PipeBarrier<PIPE_V>();
    Adds(floatLocal, floatLocal, offset_, elements);
    PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, floatLocal, RoundMode::CAST_RINT, elements);
    PipeBarrier<PIPE_V>();
    // Identity dequant scale so the int32 -> half cast does not rescale.
    SetDeqScale((half)1.000000e+00f);
    PipeBarrier<PIPE_V>();
    Cast(halfLocal, intLocal, RoundMode::CAST_ROUND, elements);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_TRUNC, elements);
    inputXCopyOutQueue_.EnQue(outLocal);
    // Release input-side buffers; only the branch that allocated them frees them.
    if constexpr (IsSameType<T, float>::value) {
        inputXCopyInQueue_.FreeTensor(floatLocal);
    } else {
        inputXCopyInQueue_.FreeTensor(inLocal);
        floatQueue_.FreeTensor(floatLocal);
    }
    halfQueue_.FreeTensor(halfLocal);
}
|
||||
|
||||
// Drives one row-tile iteration: for each column tile, walk the distinct source
// rows covered by this tile's expanded indices (index / k_ maps an expanded
// index to its source row), quantize each source row's column tile once, then
// write it to every destination slot the indices name. Index -1 means the slot
// was dropped and gets no output.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::CopyOut(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.DeQue<int32_t>();
    // Scalar GetValue reads below must see the completed index copy-in.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    colsTileLength_ = perLoopCols_;
    for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
        // Absolute expanded index this tile starts at (per-core base + loop offset).
        int64_t initialRow = gatherOutTilingData_->perCoreIndicesElements * blockIdx_ + perLoopRows_ * progress;
        int64_t curLoopRow = 0;
        if (colsLoop == colLoops_ - 1) {
            colsTileLength_ = lastLoopCols_;
        }
        int64_t currentLoopStartRow = initialRow / k_;
        int64_t currentLoopLastRow = (initialRow + currentLoopRows_ - 1) / k_;
        for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
            inputOffset_ = row * cols_ + colsLoop * perLoopCols_;
            // Stage and quantize this source row's column tile once.
            CopyXIn(inputOffset_, colsTileLength_);
            Compute();
            LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.DeQue<int8_t>();
            DataCopyExtParams intriParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(int8_t)), 0, 0, 0};
            // Emit to every expanded slot that maps back to this source row.
            while (curLoopRow < currentLoopRows_ && initialRow / k_ == row) {
                int32_t outIndex = indicesLocal.GetValue(curLoopRow);
                curLoopRow++;
                initialRow++;
                if (outIndex == -1) {
                    continue;  // dropped slot
                }
                outOffset_ = outIndex * cols_ + colsLoop * perLoopCols_;
                DataCopyPad(expandedXGm_[outOffset_], outLocal, intriParams);
            }
            inputXCopyOutQueue_.FreeTensor(outLocal);
        }
    }
    expandRowIdxCopyInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
// Binds GM buffers, reads this core's loop bounds from the tiling data (the
// last active core gets the remainder-sized bounds), caches the scalar
// quantization scale/offset from GM, and sizes the UB queues for one column
// tile each. expandedRowIdxGm_ is pre-offset to this core's index slice.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset,
                                                      GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR workspace,
                                                      const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);

    needCoreNum_ = gatherOutTilingData_->needCoreNum;
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;

    // The last active core handles the tail-sized row counts.
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = gatherOutTilingData_->lastCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->lastCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->lastCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->lastCoreIndicesLoops;
    } else {
        coreRows_ = gatherOutTilingData_->perCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->perCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->perCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->perCoreIndicesLoops;
    }
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);
    // Offset the index buffer to this core's slice so loop offsets are core-local.
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx +
                                          blockIdx_ * gatherOutTilingData_->perCoreIndicesElements,
                                      Align(coreRows_, sizeof(int32_t)));
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale, 1);
    offsetGm_.SetGlobalBuffer((__gm__ float *)offset, 1);
    // Static quantization uses a single scalar scale/offset, cached once here.
    scale_ = scaleGm_.GetValue(0);
    offset_ = offsetGm_.GetValue(0);

    pipe_->InitBuffer(inputXCopyInQueue_, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(inputXCopyOutQueue_, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM,
                      AlignBytes(perLoopCols_, sizeof(int8_t)));
    pipe_->InitBuffer(expandRowIdxCopyInQueue_, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM,
                      AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(floatQueue_, 1, AlignBytes(perLoopCols_, sizeof(float)));
    pipe_->InitBuffer(halfQueue_, 1, AlignBytes(perLoopCols_, sizeof(half)));
}
|
||||
|
||||
// Per-core driver: iterate this core's row tiles, loading one tile of
// expanded indices per loop and then gathering/quantizing/scattering it.
// The final loop uses the tail row count.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::Process()
{
    if (blockIdx_ < needCoreNum_) {
        currentLoopRows_ = perLoopRows_;
        for (int64_t loop = 0; loop < rowLoops_; loop++) {
            if (loop == rowLoops_ - 1) {
                currentLoopRows_ = lastLoopRows_;
            }
            CopyExpertIn(loop);
            CopyOut(loop);
        }
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_DROPPAD_STATIC_QUANT_H
|
||||
@@ -0,0 +1,602 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_dynamic_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_DYNAMIC_QUANT_H
|
||||
#define MOE_CUSTOM_GATHER_DYNAMIC_QUANT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
// Double-buffering depth for this kernel's staging queues.
constexpr int64_t GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM = 2;

// Gather stage with per-row dynamic int8 quantization: each output row is
// scaled by max(|row|)/127 (optionally after multiplying by a smooth vector),
// the int8 row goes to expandedX and the per-row scale to expandedScale.
// COPYOUTTYPE selects the traversal direction (gather vs. scatter variants of
// the copy-out methods below).
template <typename T, const int COPYOUTTYPE>
class MoeGatherOutDynamicQuant {
public:
    __aicore__ inline MoeGatherOutDynamicQuant(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR quantSmooth, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR expandedScale, GM_ADDR sortedExpertIdx,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyOutXDynamicQuantFromGather(int64_t progress);
    __aicore__ inline void CopyOutXDynamicQuantFromScatter(int64_t progress);
    __aicore__ inline void CopyOutXPartialDynamicQuantFromGather(int64_t progress);
    __aicore__ inline void CopyOutXPartialDynamicQuantFromScatter(int64_t progress);
    __aicore__ inline void CopyInExpandedExpertIdx(int64_t progress);  // loads row indices + expert ids
    __aicore__ inline void Compute(LocalTensor<float> &smoothLocal);   // quantizes one staged row
    __aicore__ inline float ComputeMax(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                       LocalTensor<float> &scaleLocal, int32_t srcIdx, int32_t expertIdx, int64_t j);
    __aicore__ inline void ComputeScale(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal, float scaleTemp,
                                        int64_t dstIndex, int64_t j);

private:
    TPipe *pipe_;
    TQue<QuePosition::VECIN, 1> inputXInQueue_;
    TQue<QuePosition::VECIN, 1> smoothInQueue_;      // quant smooth vector staging
    TQue<QuePosition::VECIN, 1> expandRowIdxInQueue_;  // holds row indices and expert ids back-to-back
    TQue<QuePosition::VECOUT, 1> calcQueue_;         // float scratch
    TQue<QuePosition::VECOUT, 1> inputXOutQueue_;    // int8 output rows
    TQue<QuePosition::VECOUT, 1> scaleOutQueue_;     // per-row dynamic scales

    GlobalTensor<T> inputXGm_;
    GlobalTensor<int8_t> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> quantSmoothGm_;
    GlobalTensor<float> expandedScaleGm_;
    GlobalTensor<float> quantTempGm_;
    GlobalTensor<int32_t> expandedExpertIdxGm_;
    GlobalTensor<int32_t> expertTotalCountGm_;

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Tiling-derived per-core loop bounds and mode flags (set in Init, not
    // visible in this chunk).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t totalLength_;
    int64_t perCoreRow_;
    int64_t currentLoopRows_;
    int64_t currentLoopRowsAlign_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t perLoopColsAlign_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    int64_t isInputScale_;
    int64_t expertStart_;

    int64_t indicesOffset_;
    int64_t rowIdxType_ = 0;
    int64_t dropPadMode_;
    int64_t activeNum_;
    int64_t ep_;
    int64_t smoothType_;  // SCALE_1H (shared [1,H] smooth) vs SCALE_EH (per-expert)
    int64_t coreNum_;
    int64_t expertTotalCount_ = 0;
};
|
||||
|
||||
// Loads this loop iteration's tile of row indices and, packed after them at
// offset currentLoopRowsAlign_ in the same local tensor, the matching expanded
// expert ids. Consumers read index i and expert currentLoopRowsAlign_ + i.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyInExpandedExpertIdx(int64_t progress)
{
    indicesOffset_ = progress * perLoopRows_;
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams)_;
    DataCopyPad(indicesLocal[currentLoopRowsAlign_], expandedExpertIdxGm_[indicesOffset_], dataCopyParams,
                dataCopyPadParams);
    expandRowIdxInQueue_.EnQue<int32_t>(indicesLocal);
}
|
||||
|
||||
// Dynamically quantizes one staged row: optionally multiplies by smoothLocal,
// computes scale = max(|row|) / 127, divides the row by it, and converts to
// int8 via the int32 -> half -> int8 cast chain. Enqueues the int8 row on
// inputXOutQueue_ and the scalar scale on scaleOutQueue_.
// For non-float T the raw data was staged at offset perLoopColsAlign_ inside
// the same buffer, so the widening cast can write to its front in place.
// NOTE(review): inLocal is dequeued but neither freed nor re-enqueued here —
// presumably the caller releases it; confirm against the copy-out methods.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::Compute(LocalTensor<float> &smoothLocal)
{
    LocalTensor<float> inLocal = inputXInQueue_.DeQue<float>();

    LocalTensor<float> tempLocal = calcQueue_.AllocTensor<float>();
    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();
    LocalTensor<float> scaleLocal = scaleOutQueue_.AllocTensor<float>();

    if constexpr (!IsSameType<T, float>::value) {
        // Widen T -> float from the staged back half of the buffer into its front.
        Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign_], RoundMode::CAST_NONE, cols_);
        PipeBarrier<PIPE_V>();
    }

    if (isInputScale_) {
        // Apply the smooth vector before measuring the row maximum.
        Mul(inLocal, inLocal, smoothLocal, cols_);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, cols_);
    PipeBarrier<PIPE_V>();

    ReduceMax(scaleLocal, tempLocal, tempLocal, cols_); // get max value and index [0,1]

    // Per-row dynamic scale; MAX_INT8 maps the row maximum to the int8 range.
    float scaleValue = scaleLocal.GetValue(0) / MAX_INT8;

    // Broadcast the scale: one block's worth for the scale output, a full row
    // for the element-wise division.
    Duplicate<float>(scaleLocal, scaleValue, INT32_ONE_BLOCK_NUM);
    PipeBarrier<PIPE_V>();
    Duplicate<float>(tempLocal, scaleValue, cols_);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, cols_);
    PipeBarrier<PIPE_V>();

    // Reuse tempLocal as int32 storage for the rounding stage.
    LocalTensor<int32_t> intLocal = tempLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, tempLocal, RoundMode::CAST_RINT, cols_);
    PipeBarrier<PIPE_V>();
    // Identity dequant scale so the int32 -> half cast does not rescale.
    SetDeqScale((half)1.000000e+00f);
    Cast(intLocal.ReinterpretCast<half>(), intLocal, RoundMode::CAST_ROUND, cols_);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, intLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, cols_);

    calcQueue_.FreeTensor(tempLocal);
    inputXOutQueue_.EnQue(outLocal);
    scaleOutQueue_.EnQue(scaleLocal);
}
|
||||
|
||||
// Scatter-style dynamic-quant copy-out (single column pass, row fits in UB):
// destination row index is the sorted position (rowOffset + i); the source row
// is looked up through indicesLocal. Each row is loaded, quantized via
// Compute(), and the int8 row plus its scale are written to GM.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXDynamicQuantFromScatter(int64_t progress)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(T)), 0, 0, 0};
    DataCopyExtParams smoothParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(int8_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
    LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();

    // copyin [1,H] scale: a single shared smooth vector, loaded once per loop.
    if (smoothType_ == SCALE_1H) {
        DataCopyPad(smoothLocal, quantSmoothGm_, smoothParams, {false, 0, 0, 0});
        smoothInQueue_.EnQue(smoothLocal);
        smoothLocal = smoothInQueue_.DeQue<float>();
    }

    int32_t lastExpertIdx = -1; // expert whose smooth vector is currently resident
    for (int64_t i = 0; i < currentLoopRows_; i++) {
        int64_t rowOffset = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        // Destination offsets grow monotonically, so once activeNum_ is reached
        // in dropless mode the remaining rows of this loop can be skipped.
        if (dropPadMode_ == DROPLESS_MODE && (rowOffset + i) >= activeNum_) {
            break;
        }
        LocalTensor<T> inLocal = inputXInQueue_.AllocTensor<T>();
        int32_t srcIdx = indicesLocal.GetValue(i);

        // The second half of indicesLocal (offset currentLoopRowsAlign_) holds the
        // sorted expert ids staged by CopyInExpandedExpertIdx.
        int32_t expertIdx = indicesLocal.GetValue(currentLoopRowsAlign_ + i) - expertStart_;
        if constexpr (IsSameType<T, float>::value) {
            DataCopyPad(inLocal, inputXGm_[srcIdx / k_ * cols_], copyInParams, {false, 0, 0, 0});
        } else {
            // Narrow input is staged at offset perLoopColsAlign_; Compute() widens it.
            DataCopyPad(inLocal[perLoopColsAlign_], inputXGm_[srcIdx / k_ * cols_], copyInParams, {false, 0, 0, 0});
        }
        inputXInQueue_.EnQue<T>(inLocal);

        // copyin dynamic scale: per-expert smooth vector, reloaded only when the
        // expert changes (indices are sorted by expert, so reloads are rare).
        if (smoothType_ == SCALE_EH && expertIdx != lastExpertIdx) {
            DataCopyPad(smoothLocal, quantSmoothGm_[expertIdx * this->cols_], smoothParams, {false, 0, 0, 0});
            smoothInQueue_.EnQue(smoothLocal);
            smoothLocal = smoothInQueue_.DeQue<float>();
            lastExpertIdx = expertIdx;
        }
        Compute(smoothLocal);
        inputXInQueue_.FreeTensor(inLocal);
        LocalTensor<float> scaleLocal = scaleOutQueue_.DeQue<float>();
        DataCopyPad(expandedScaleGm_[(rowOffset + i)], scaleLocal, quantScaleParams);
        LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
        DataCopyPad(expandedXGm_[(rowOffset + i) * cols_], outLocal, copyOutParams);

        inputXOutQueue_.FreeTensor(outLocal);
        scaleOutQueue_.FreeTensor(scaleLocal);
    }

    smoothInQueue_.FreeTensor(smoothLocal);
    expandRowIdxInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE>
|
||||
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXDynamicQuantFromGather(int64_t progress)
|
||||
{
|
||||
DataCopyExtParams copyInParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(T)), 0, 0, 0};
|
||||
DataCopyExtParams smoothParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(float)), 0, 0, 0};
|
||||
DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(int8_t)), 0, 0, 0};
|
||||
DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
|
||||
LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
|
||||
LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();
|
||||
|
||||
int64_t rowOffset = blockIdx_ * perCoreRow_ + progress * perLoopRows_;
|
||||
int64_t startXRow = rowOffset / k_;
|
||||
int64_t endXRow = (rowOffset + currentLoopRows_ - 1) / k_;
|
||||
int64_t curIndex = 0;
|
||||
|
||||
if (smoothType_ == SCALE_1H) {
|
||||
DataCopyPad(smoothLocal, quantSmoothGm_, smoothParams, {false, 0, 0, 0});
|
||||
smoothInQueue_.EnQue(smoothLocal);
|
||||
smoothLocal = smoothInQueue_.DeQue<float>();
|
||||
}
|
||||
|
||||
for (int64_t row = startXRow; row <= endXRow; row++) {
|
||||
LocalTensor<T> inLocal = inputXInQueue_.AllocTensor<T>();
|
||||
if constexpr (IsSameType<T, float>::value) {
|
||||
DataCopyPad(inLocal, inputXGm_[row * cols_], copyInParams, {false, 0, 0, 0});
|
||||
} else {
|
||||
DataCopyPad(inLocal[perLoopColsAlign_], inputXGm_[row * cols_], copyInParams, {false, 0, 0, 0});
|
||||
}
|
||||
inputXInQueue_.EnQue<T>(inLocal);
|
||||
Compute(smoothLocal);
|
||||
LocalTensor<float> scaleLocal = scaleOutQueue_.DeQue<float>();
|
||||
LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
|
||||
|
||||
while (curIndex < currentLoopRows_ && (rowOffset + curIndex) / this->k_ == row) {
|
||||
int32_t outIndex = indicesLocal.GetValue(curIndex);
|
||||
curIndex++;
|
||||
if (outIndex == -1 || dropPadMode_ == DROPLESS_MODE && outIndex >= this->activeNum_) {
|
||||
continue;
|
||||
}
|
||||
DataCopyPad(expandedXGm_[outIndex * cols_], outLocal, copyOutParams);
|
||||
DataCopyPad(expandedScaleGm_[outIndex], scaleLocal, quantScaleParams);
|
||||
}
|
||||
|
||||
inputXInQueue_.FreeTensor(inLocal);
|
||||
inputXOutQueue_.FreeTensor(outLocal);
|
||||
scaleOutQueue_.FreeTensor(scaleLocal);
|
||||
}
|
||||
|
||||
smoothInQueue_.FreeTensor(smoothLocal);
|
||||
expandRowIdxInQueue_.FreeTensor(indicesLocal);
|
||||
}
|
||||
|
||||
// First pass of the two-pass (column-tiled) quantization: loads one column tile
// of source row srcIdx, optionally applies the smooth vector, spills the smoothed
// float tile to the per-core workspace quantTempGm_ (re-read later by
// ComputeScale), and returns the tile's absolute maximum.
// Caller must set colsTileLength_ before each call; j is the column-tile index.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline float
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::ComputeMax(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                                     LocalTensor<float> &scaleLocal, int32_t srcIdx, int32_t expertIdx,
                                                     int64_t j)
{
    LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();

    DataCopyExtParams intriParamsT{1, static_cast<uint32_t>(colsTileLength_ * sizeof(T)), 0, 0, 0};
    DataCopyExtParams intriParamsFp32{1, static_cast<uint32_t>(colsTileLength_ * sizeof(float)), 0, 0, 0};

    // Narrow (non-float) input is staged at offset perLoopColsAlign_ and widened
    // to float below; float input lands at the start of the buffer directly.
    if constexpr (!IsSameType<T, float>::value) {
        DataCopyPad(inLocal.ReinterpretCast<T>()[perLoopColsAlign_], inputXGm_[srcIdx * cols_ + j * perLoopCols_],
                    intriParamsT, {false, 0, 0, 0});
    } else {
        DataCopyPad(inLocal, inputXGm_[srcIdx * cols_ + j * perLoopCols_], intriParamsT, {false, 0, 0, 0});
    }

    inputXInQueue_.EnQue<float>(inLocal);
    inLocal = inputXInQueue_.DeQue<float>();

    if (isInputScale_) {
        DataCopyPad(smoothLocal, quantSmoothGm_[expertIdx * cols_ + j * perLoopCols_], intriParamsFp32,
                    {false, 0, 0, 0});
        smoothInQueue_.EnQue(smoothLocal);
        smoothLocal = smoothInQueue_.DeQue<float>();
    }

    if constexpr (!IsSameType<T, float>::value) {
        Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign_], RoundMode::CAST_NONE, colsTileLength_);
        PipeBarrier<PIPE_V>();
    }

    if (isInputScale_) {
        Mul(inLocal, inLocal, smoothLocal, colsTileLength_);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, colsTileLength_);
    PipeBarrier<PIPE_V>();

    // Write the tile max at offset INT32_ONE_BLOCK_NUM so it does not clobber the
    // per-row scale kept at element 0 of scaleLocal.
    ReduceMax(scaleLocal[INT32_ONE_BLOCK_NUM], tempLocal, tempLocal, colsTileLength_);

    // Spill the (smoothed) float tile for the second pass.
    DataCopyPad(quantTempGm_[j * perLoopCols_], inLocal, intriParamsFp32);
    smoothInQueue_.FreeTensor(smoothLocal);
    // Ensure the MTE3 spill completes before the next tile's MTE2 load reuses UB.
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    return scaleLocal.GetValue(INT32_ONE_BLOCK_NUM);
}
|
||||
|
||||
// Second pass of the two-pass quantization: re-loads the smoothed float tile j
// that ComputeMax() spilled to quantTempGm_, divides by the row-wide scale
// scaleTemp, casts to int8 (via half), and writes the tile into destination row
// dstIndex of expandedXGm_. colsTileLength_ must match the first pass.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::ComputeScale(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                                       float scaleTemp, int64_t dstIndex, int64_t j)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(int8_t)), 0, 0, 0};

    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();

    // Reload the pre-smoothed float tile from the workspace.
    DataCopyPad(inLocal, quantTempGm_[j * perLoopCols_], copyInParams, {false, 0, 0, 0});
    inputXInQueue_.EnQue<float>(inLocal);
    inLocal = inputXInQueue_.DeQue<float>();

    Duplicate<float>(tempLocal, scaleTemp, colsTileLength_);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, colsTileLength_);
    PipeBarrier<PIPE_V>();

    // float -> half -> int8 cast chain.
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength_);
    PipeBarrier<PIPE_V>();

    Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength_);

    inputXOutQueue_.EnQue(outLocal);
    outLocal = inputXOutQueue_.DeQue<int8_t>();
    DataCopyPad(expandedXGm_[dstIndex * cols_ + j * perLoopCols_], outLocal, copyOutParams);

    inputXOutQueue_.FreeTensor(outLocal);
    // Ensure the copy-out finishes before the next tile's load reuses the buffers.
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
}
|
||||
|
||||
// Scatter-style dynamic-quant copy-out for rows too wide to fit in UB in one
// piece: two passes over the column tiles. Pass 1 (ComputeMax) finds the
// row-wide absolute maximum while spilling smoothed tiles to the workspace;
// pass 2 (ComputeScale) quantizes each tile with the resulting scale.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXPartialDynamicQuantFromScatter(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
    for (int64_t i = 0; i < currentLoopRows_; i++) {
        int64_t rowOffset = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        // Dropless mode: destination offsets are monotonic, so stop once the
        // active-row budget is exhausted.
        if (dropPadMode_ == DROPLESS_MODE && (rowOffset + i) >= activeNum_) {
            break;
        }
        int32_t srcIdx = indicesLocal.GetValue(i);
        // Sorted expert ids live in the second half of indicesLocal.
        int32_t expertIdx = indicesLocal.GetValue(currentLoopRowsAlign_ + i) - expertStart_;
        LocalTensor<float> inLocal = inputXInQueue_.AllocTensor<float>();
        LocalTensor<float> tempLocal = calcQueue_.AllocTensor<float>();
        LocalTensor<float> scaleLocal = scaleOutQueue_.AllocTensor<float>();

        float tileMax;
        // Seed for the running maximum: INF's bit pattern reinterpreted as float
        // (declared elsewhere; presumably encodes -inf — confirm at its definition).
        float reduceMax = *((float *)&INF);
        for (int64_t j = 0; j < colLoops_; j++) {
            colsTileLength_ = perLoopCols_;
            if (j == colLoops_ - 1) {
                colsTileLength_ = lastLoopCols_;
            }

            if (smoothType_ == SCALE_1H) {
                // 1H: single shared smooth vector -> expert slot 0.
                tileMax = ComputeMax(inLocal, tempLocal, scaleLocal, srcIdx / k_, 0, j);
            } else {
                // EH: per-expert smooth vector.
                tileMax = ComputeMax(inLocal, tempLocal, scaleLocal, srcIdx / k_, expertIdx, j);
            }
            reduceMax = (reduceMax > tileMax) ? reduceMax : tileMax;
        }

        // Row-wide dynamic scale, written out once per destination row.
        float scaleTemp = reduceMax / MAX_INT8;
        Duplicate<float>(scaleLocal, scaleTemp, INT32_ONE_BLOCK_NUM);
        scaleOutQueue_.EnQue(scaleLocal);
        scaleLocal = scaleOutQueue_.DeQue<float>();

        DataCopyPad(expandedScaleGm_[(rowOffset + i)], scaleLocal, {1, 4, 0, 0, 0});

        // Pass 2: quantize every tile with the row-wide scale.
        for (int64_t j = 0; j < colLoops_; j++) {
            colsTileLength_ = perLoopCols_;
            if (j == colLoops_ - 1) {
                colsTileLength_ = lastLoopCols_;
            }
            ComputeScale(inLocal, tempLocal, scaleTemp, rowOffset + i, j);
        }
        inputXInQueue_.FreeTensor(inLocal);
        calcQueue_.FreeTensor(tempLocal);
        scaleOutQueue_.FreeTensor(scaleLocal);
    }
    expandRowIdxInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
// Gather-style dynamic-quant copy-out for rows too wide to fit in UB in one
// piece: per distinct source row, pass 1 (ComputeMax) finds the row maximum
// across column tiles, then for every destination position mapped to the row
// (duplicates from top-k) pass 2 (ComputeScale) quantizes tile by tile.
// Note: ComputeMax is always called with expert slot 0 here; the 1H/EH split is
// handled by isInputScale_ inside ComputeMax (gm offset 0 when expertIdx == 0).
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXPartialDynamicQuantFromGather(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
    int64_t rowOffset = blockIdx_ * perCoreRow_ + progress * perLoopRows_;
    // Each source row appears k_ times in sorted order; derive the covered range.
    int64_t startXRow = rowOffset / k_;
    int64_t endXRow = (rowOffset + currentLoopRows_ - 1) / k_;
    int64_t curIndex = 0;

    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    for (int64_t row = startXRow; row <= endXRow; row++) {
        LocalTensor<float> inLocal = inputXInQueue_.AllocTensor<float>();
        LocalTensor<float> tempLocal = calcQueue_.AllocTensor<float>();
        LocalTensor<float> quantScaleLocal = scaleOutQueue_.AllocTensor<float>();

        // Seed for the running maximum: INF's bit pattern reinterpreted as float
        // (declared elsewhere; presumably encodes -inf — confirm at its definition).
        float reduceMax = *((float *)&INF);
        for (int64_t j = 0; j < colLoops_; j++) {
            colsTileLength_ = perLoopCols_;
            if (j == colLoops_ - 1) {
                colsTileLength_ = lastLoopCols_;
            }

            float tileMax = ComputeMax(inLocal, tempLocal, quantScaleLocal, row, 0, j);
            reduceMax = (reduceMax > tileMax) ? reduceMax : tileMax;
        }

        // Row-wide dynamic scale.
        float scaleTemp = reduceMax / MAX_INT8;
        Duplicate<float>(quantScaleLocal, scaleTemp, INT32_ONE_BLOCK_NUM);
        scaleOutQueue_.EnQue(quantScaleLocal);
        quantScaleLocal = scaleOutQueue_.DeQue<float>();

        // Fan the quantized row out to every destination mapped to this source row.
        while (curIndex < currentLoopRows_ && (curIndex + rowOffset) / k_ == row) {
            int32_t outIndex = indicesLocal.GetValue(curIndex);
            curIndex++;
            // -1 marks a dropped row; dropless mode also drops rows past activeNum_.
            if (outIndex == -1 || (dropPadMode_ == DROPLESS_MODE && outIndex >= activeNum_)) {
                continue;
            }
            DataCopyPad(expandedScaleGm_[outIndex], quantScaleLocal, quantScaleParams);
            for (int64_t j = 0; j < colLoops_; j++) {
                colsTileLength_ = perLoopCols_;
                if (j == colLoops_ - 1) {
                    colsTileLength_ = lastLoopCols_;
                }
                ComputeScale(inLocal, tempLocal, scaleTemp, outIndex, j);
            }
        }
        inputXInQueue_.FreeTensor(inLocal);
        calcQueue_.FreeTensor(tempLocal);
        scaleOutQueue_.FreeTensor(quantScaleLocal);
    }
    expandRowIdxInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
// Initializes tiling-derived state, global-memory views and UB queues for the
// dynamic-quant gather/scatter copy-out stage.
// The sortedExpertIdx buffer doubles as a workspace; the offsets used below
// imply this int32-slot layout (inferred from usage — confirm against the
// tiling/host code): sorted expert ids [0, align(n*k)), row-index table
// [align(n*k), 2*align(n*k)), then align(actualExpertNum) of per-expert data,
// the total routed-row counter, and (when colLoops_ > 1) a float spill area of
// cols_ per core for the two-pass quantization.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::Init(GM_ADDR inputX, GM_ADDR quantSmooth, GM_ADDR sortedExpertIdx,
                                               GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR expandedScale,
                                               const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    totalLength_ = n_ * k_;
    isInputScale_ = tilingData->isInputScale;
    expertStart_ = tilingData->expertStart;
    rowIdxType_ = tilingData->rowIdxType;
    dropPadMode_ = tilingData->dropPadMode;
    activeNum_ = tilingData->activeNum;
    ep_ = tilingData->ep;
    smoothType_ = tilingData->smoothType;
    coreNum_ = tilingData->coreNum;

    // core split
    int64_t actualExpertNum_ = tilingData->actualExpertNum;
    if (ep_) {
        // Expert-parallel: the actual number of routed rows was produced by an
        // earlier stage and is read back from the workspace.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + Align(n_ * k_, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            1);
        // Invalidate the cache line so the value written by another core is seen.
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = totalLength_;
    }

    // Split the routed rows evenly across cores; the last core takes the remainder.
    perCoreRow_ = Ceil(expertTotalCount_, tilingData->coreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreRow_);
    int64_t lastCoreIndicesElements = expertTotalCount_ - (needCoreNum_ - 1) * perCoreRow_;

    // inner core split
    int64_t originPerLoopElements;
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = lastCoreIndicesElements;
        originPerLoopElements = gatherOutTilingData_->lastCorePerLoopIndicesElements;
    } else {
        coreRows_ = perCoreRow_;
        originPerLoopElements = gatherOutTilingData_->perCorePerLoopIndicesElements;
    }
    perLoopRows_ = Min(coreRows_, originPerLoopElements);
    rowLoops_ = Ceil(coreRows_, perLoopRows_);
    lastLoopRows_ = coreRows_ - (rowLoops_ - 1) * perLoopRows_;

    // cols split
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    perLoopColsAlign_ = Align(perLoopCols_, sizeof(T));

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);

    expandedExpertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + blockIdx_ * perCoreRow_,
                                         Align(coreRows_, sizeof(int32_t)));

    // Pick the row-index table: either the caller-provided expandedRowIdx output
    // or the internally computed table inside the workspace, depending on which
    // direction (rowIdxType_) matches this copy-out's COPYOUTTYPE.
    if constexpr (COPYOUTTYPE == SCATTER) {
        if (rowIdxType_ == SCATTER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        }
    } else {
        if (rowIdxType_ == GATHER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        }
    }

    if (isInputScale_) {
        quantSmoothGm_.SetGlobalBuffer((__gm__ float *)quantSmooth);
    }
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);

    // Two-pass path needs a per-core float spill area for smoothed tiles.
    if (colLoops_ > 1) {
        quantTempGm_.SetGlobalBuffer((__gm__ float *)sortedExpertIdx + Align(totalLength_, sizeof(int32_t)) * 2 +
                                         Align(actualExpertNum_, sizeof(int32_t)) * 2 +
                                         Align(totalLength_, sizeof(int32_t)) + blockIdx_ * cols_,
                                     cols_ * sizeof(float));
    }

    currentLoopRowsAlign_ = Align(perLoopRows_, sizeof(int32_t));

    // inputXInQueue_ must hold the float copy plus the narrow staging area; never
    // smaller than two 32-byte blocks.
    int64_t perLoopColsAlignBytes = AlignBytes(this->perLoopCols_, sizeof(T));
    perLoopColsAlignBytes =
        Max(int64_t(perLoopColsAlignBytes * sizeof(float) / sizeof(T)), int64_t(BLOCK_BYTES + BLOCK_BYTES));
    // 2x: row indices and expert ids are staged back-to-back in one buffer.
    pipe_->InitBuffer(expandRowIdxInQueue_, GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM,
                      2 * AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(inputXInQueue_, GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM, perLoopColsAlignBytes); // percols * 2 * 4
    pipe_->InitBuffer(smoothInQueue_, GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM,
                      AlignBytes(perLoopCols_, sizeof(float))); // percols * 2 * 4
    pipe_->InitBuffer(calcQueue_, 1, AlignBytes(perLoopCols_, sizeof(float))); // percols * 1 * 4
    pipe_->InitBuffer(inputXOutQueue_, 1, AlignBytes(perLoopCols_, sizeof(int8_t))); // percols * 1
    pipe_->InitBuffer(scaleOutQueue_, 1, BLOCK_BYTES + BLOCK_BYTES); // 32 + 32
}
|
||||
|
||||
// Main loop of the dynamic-quant copy-out stage. Only cores with work
// (blockIdx_ < needCoreNum_) participate. Per row loop: stage the sorted
// indices, then dispatch to the single-pass path (row fits in UB) or the
// two-pass "partial" path (colLoops_ > 1, column-tiled with workspace spill),
// each in its gather or scatter flavor selected at compile time by COPYOUTTYPE.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::Process()
{
    if (blockIdx_ < needCoreNum_) {
        currentLoopRows_ = perLoopRows_;
        if (colLoops_ > 1) {
            // Row too wide for UB: two-pass quantization over column tiles.
            for (int64_t loop = 0; loop < rowLoops_; loop++) {
                if (loop == rowLoops_ - 1) {
                    currentLoopRows_ = lastLoopRows_;
                }
                CopyInExpandedExpertIdx(loop);
                if constexpr (COPYOUTTYPE == GATHER) {
                    CopyOutXPartialDynamicQuantFromGather(loop);
                } else {
                    CopyOutXPartialDynamicQuantFromScatter(loop);
                }
            }
        } else {
            // Whole row fits in UB: single-pass quantization.
            for (int64_t loop = 0; loop < rowLoops_; loop++) {
                if (loop == rowLoops_ - 1) {
                    currentLoopRows_ = lastLoopRows_;
                }
                CopyInExpandedExpertIdx(loop);
                if constexpr (COPYOUTTYPE == GATHER) {
                    CopyOutXDynamicQuantFromGather(loop);
                } else {
                    CopyOutXDynamicQuantFromScatter(loop);
                }
            }
        }
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_DYNAMIC_QUANT_H
|
||||
321
csrc/moe_init_routing_custom/op_kernel/moe_custom_gather_out.h
Normal file
321
csrc/moe_init_routing_custom/op_kernel/moe_custom_gather_out.h
Normal file
@@ -0,0 +1,321 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_out.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_OUT_H
|
||||
#define MOE_CUSTOM_GATHER_OUT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t GATHER_OUT_BUFFER_NUM = 2;
|
||||
|
||||
// Plain (non-quantizing) gather/scatter copy-out stage of MoeInitRoutingCustom:
// moves input rows (and, optionally, one float scale per row) to their expanded
// positions according to the expanded row indices. The compile-time EP flag
// selects expert-parallel mode, in which the number of valid routed rows is
// read back from the workspace instead of being n*k.
template <typename T, const int EP>
class MoeGatherOut {
public:
    __aicore__ inline MoeGatherOut(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR scale, GM_ADDR workspace, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR expandedScale, const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();
    // Stage one loop's slice of expanded row indices into UB.
    __aicore__ inline void CopyExpertIn(int64_t progress);
    // Single-row GM -> UB / UB -> GM transfers for row data and per-row scales.
    __aicore__ inline void CopyXIn(int64_t xSrcOffset, int64_t curLoopCols);
    __aicore__ inline void CopyXOut(int64_t xDstOffset, int64_t curLoopCols);
    __aicore__ inline void CopyScaleIn(int64_t scaleSrcOffset);
    __aicore__ inline void CopyScaleOut(int64_t scaleDstOffset);
    // Copy-out flavors: gather iterates source rows and fans out to destinations;
    // scatter iterates sorted positions and pulls from source rows.
    __aicore__ inline void GatherCopyOut(int64_t progress);
    __aicore__ inline void ScatterCopyOut(int64_t progress);

private:
    TPipe *pipe_;
    // Double-buffered staging queues (GATHER_OUT_BUFFER_NUM = 2).
    TQueBind<TPosition::VECIN, TPosition::VECOUT, GATHER_OUT_BUFFER_NUM> xCopyInQueue_;
    TQueBind<TPosition::VECIN, TPosition::VECOUT, GATHER_OUT_BUFFER_NUM> scaleCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_BUFFER_NUM> expandedRowIdxCopyInQueue_;

    // Global-memory views (set up in Init()).
    GlobalTensor<T> xGm_;
    GlobalTensor<float> xGscaleGm_;
    GlobalTensor<int32_t> sortedExpertIdxGm_;
    GlobalTensor<T> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> expandedScaleGm_;
    GlobalTensor<int32_t> expertTotalCountGm_;

    // Problem shape and mode flags from tiling.
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t activeNum_;
    int64_t dropPadMode_;

    // Column tiling.
    int64_t colsLoops_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;

    // Index (row) tiling for the current core.
    int64_t indicesLoops_;
    int64_t curLoopElements_;

    int64_t perCoreIndicesElements_;
    int64_t lastCoreIndicesElements_;
    int64_t perCorePerLoopIndicesElements_;
    int64_t lastCorePerLoopIndicesElements_;
    int64_t curCorePerLoopIndicesElements_;
    int64_t curCoreLastLoopIndicesElements_;
    int64_t needCoreNum_;
    int64_t curCoreIndicesElements_;

    int64_t actualExpertNum_;
    int64_t expertTotalCount_;

    int64_t rowIdxType_;
    int64_t isInputScale_;
    int64_t coreNum_;
};
|
||||
|
||||
// Initializes tiling-derived state, global-memory views and UB queues for the
// plain gather/scatter copy-out. The workspace layout mirrors the dynamic-quant
// stage: sorted expert ids, the computed row-index table at align(n*k), and
// (in EP mode) the total routed-row counter after 2*align(n*k) +
// align(actualExpertNum) int32 slots (layout inferred from offsets — confirm
// against the tiling/host code).
template <typename T, const int EP>
__aicore__ inline void MoeGatherOut<T, EP>::Init(GM_ADDR x, GM_ADDR scale, GM_ADDR workspace, GM_ADDR expandedRowIdx,
                                                 GM_ADDR expandedX, GM_ADDR expandedScale,
                                                 const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();

    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    coreNum_ = tilingData->coreNum;
    dropPadMode_ = tilingData->dropPadMode;
    activeNum_ = tilingData->activeNum;

    isInputScale_ = tilingData->isInputScale;
    rowIdxType_ = tilingData->rowIdxType;

    colsLoops_ = tilingData->gatherOutComputeParamsOp.colsLoops;
    perLoopCols_ = tilingData->gatherOutComputeParamsOp.perLoopCols;
    lastLoopCols_ = tilingData->gatherOutComputeParamsOp.lastLoopCols;

    actualExpertNum_ = tilingData->actualExpertNum;

    if constexpr (EP) {
        // Expert-parallel: the routed-row count is produced by an earlier stage.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            1);
        // Invalidate the cache line so the value written by another core is seen.
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = n_ * k_;
    }

    // Split the routed rows across cores; the last core takes the remainder.
    perCorePerLoopIndicesElements_ = tilingData->gatherOutComputeParamsOp.perCorePerLoopIndicesElements;
    lastCorePerLoopIndicesElements_ = tilingData->gatherOutComputeParamsOp.lastCorePerLoopIndicesElements;
    perCoreIndicesElements_ = Ceil(expertTotalCount_, tilingData->coreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreIndicesElements_);
    lastCoreIndicesElements_ = expertTotalCount_ - (needCoreNum_ - 1) * perCoreIndicesElements_;

    if (blockIdx_ == needCoreNum_ - 1) {
        curCoreIndicesElements_ = lastCoreIndicesElements_;
        curCorePerLoopIndicesElements_ = Min(lastCorePerLoopIndicesElements_, curCoreIndicesElements_);
    } else {
        curCoreIndicesElements_ = perCoreIndicesElements_;
        curCorePerLoopIndicesElements_ = Min(perCorePerLoopIndicesElements_, curCoreIndicesElements_);
    }
    indicesLoops_ = Ceil(curCoreIndicesElements_, curCorePerLoopIndicesElements_);
    curCoreLastLoopIndicesElements_ = curCoreIndicesElements_ - (indicesLoops_ - 1) * curCorePerLoopIndicesElements_;

    xGm_.SetGlobalBuffer((__gm__ T *)x, n_ * cols_);
    xGscaleGm_.SetGlobalBuffer((__gm__ float *)scale, n_);

    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX);
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);

    pipe_->InitBuffer(expandedRowIdxCopyInQueue_, GATHER_OUT_BUFFER_NUM,
                      AlignBytes(curCorePerLoopIndicesElements_, sizeof(int32_t)));
    pipe_->InitBuffer(xCopyInQueue_, GATHER_OUT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(scaleCopyInQueue_, GATHER_OUT_BUFFER_NUM, AlignBytes(1, sizeof(float)));

    sortedExpertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + blockIdx_ * perCoreIndicesElements_,
                                       Align(curCoreIndicesElements_, sizeof(int32_t)));

    // Pick the row-index table: the caller-provided expandedRowIdx output or the
    // internally computed table in the workspace, depending on rowIdxType_.
    if constexpr (EP) {
        if (rowIdxType_ == SCATTER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        }
    } else {
        if (rowIdxType_ == GATHER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        }
    }
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyExpertIn(int64_t progress)
|
||||
{
|
||||
LocalTensor<int32_t> subRowIdxLocal = expandedRowIdxCopyInQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams copyParams{1, static_cast<uint32_t>(curLoopElements_ * sizeof(int32_t)), 0, 0, 0};
|
||||
DataCopyPadExtParams<int32_t> padParams{false, 0, 0, 0};
|
||||
DataCopyPad(subRowIdxLocal, expandedRowIdxGm_[progress * curCorePerLoopIndicesElements_], copyParams, padParams);
|
||||
expandedRowIdxCopyInQueue_.EnQue(subRowIdxLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyXIn(int64_t xSrcOffset, int64_t curLoopCols)
|
||||
{
|
||||
LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
|
||||
DataCopyExtParams copyParams0{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
|
||||
DataCopyPadExtParams<T> padParams0{false, 0, 0, 0};
|
||||
DataCopyPad(xLocal, xGm_[xSrcOffset], copyParams0, padParams0);
|
||||
xCopyInQueue_.EnQue(xLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyXOut(int64_t xDstOffset, int64_t curLoopCols)
|
||||
{
|
||||
LocalTensor<T> xLocal = xCopyInQueue_.DeQue<T>();
|
||||
DataCopyExtParams copyParams2{1, static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
|
||||
DataCopyPad(expandedXGm_[xDstOffset], xLocal, copyParams2);
|
||||
xCopyInQueue_.FreeTensor(xLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyScaleIn(int64_t scaleSrcOffset)
|
||||
{
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
|
||||
DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(1 * sizeof(float)), 0, 0, 0};
|
||||
DataCopyPadExtParams<float> padParams1{false, 0, 0, 0};
|
||||
DataCopyPad(scaleLocal, xGscaleGm_[scaleSrcOffset], copyParams1, padParams1);
|
||||
scaleCopyInQueue_.EnQue(scaleLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyScaleOut(int64_t scaleDstOffset)
|
||||
{
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.DeQue<float>();
|
||||
DataCopyExtParams copyParams3{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
|
||||
DataCopyPad(expandedScaleGm_[scaleDstOffset], scaleLocal, copyParams3);
|
||||
scaleCopyInQueue_.FreeTensor(scaleLocal);
|
||||
}
|
||||
|
||||
// Gather-style copy-out: for each column tile and each distinct source row
// covered by this loop's index slice, load the row tile (and, on the first
// column tile, its scale) once, then fan it out to every destination position
// in subRowIdxLocal that maps to that source row (duplicates from top-k).
template <typename T, const int EP>
__aicore__ inline void MoeGatherOut<T, EP>::GatherCopyOut(int64_t progress)
{
    LocalTensor<int32_t> subRowIdxLocal = expandedRowIdxCopyInQueue_.DeQue<int32_t>();
    // Scalar GetValue reads below must not race the MTE2 index load.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    int64_t curLoopCols = perLoopCols_;
    for (int64_t colsLoop = 0; colsLoop < colsLoops_; colsLoop++) {
        int64_t initialRow = blockIdx_ * perCoreIndicesElements_ + curCorePerLoopIndicesElements_ * progress;
        int64_t curLoopRow = 0;
        if (colsLoop == colsLoops_ - 1) {
            curLoopCols = lastLoopCols_;
        }
        // Each source row appears k_ times in sorted order; derive the row range.
        int64_t currentLoopStartRow = initialRow / k_;
        int64_t currentLoopLastRow = (initialRow + this->curLoopElements_ - 1) / k_;
        for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
            LocalTensor<T> inLocal = xCopyInQueue_.AllocTensor<T>();
            int64_t inputOffset = row * cols_ + colsLoop * perLoopCols_;
            DataCopyExtParams xCopyParams{1, static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
            DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
            DataCopyPad(inLocal, xGm_[inputOffset], xCopyParams, dataCopyPadParams);
            // copy in scale (only once per row, on the first column tile)
            LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
            DataCopyExtParams scaleCopyParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
            if (isInputScale_ == 1 && colsLoop == 0) {
                DataCopyPadExtParams<float> scalePadParams{false, 0, 0, 0};
                DataCopyPad(scaleLocal, xGscaleGm_[row], scaleCopyParams, scalePadParams);
            }
            // The copy-out below re-reads the buffers just loaded; order MTE2 -> MTE3.
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            DataCopyExtParams intriParams{1, static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
            // Fan out to every destination mapped to this source row.
            while (curLoopRow < this->curLoopElements_ && initialRow / k_ == row) {
                int32_t outIndex = subRowIdxLocal.GetValue(curLoopRow);
                curLoopRow++;
                initialRow++;
                // -1 marks a dropped row; dropless mode also drops rows past activeNum_.
                if (outIndex == -1 || (dropPadMode_ == DROPLESS_MODE && outIndex >= activeNum_)) {
                    continue;
                }
                int64_t outOffset = outIndex * this->cols_ + colsLoop * this->perLoopCols_;
                DataCopyPad(expandedXGm_[outOffset], inLocal, intriParams);
                // copy out scale
                if (isInputScale_ == 1 && colsLoop == 0) {
                    DataCopyPad(expandedScaleGm_[outIndex], scaleLocal, scaleCopyParams);
                }
            }
            scaleCopyInQueue_.FreeTensor(scaleLocal);
            xCopyInQueue_.FreeTensor(inLocal);
        }
    }
    expandedRowIdxCopyInQueue_.FreeTensor(subRowIdxLocal);
}
|
||||
|
||||
template <typename T, const int EP>
__aicore__ inline void MoeGatherOut<T, EP>::ScatterCopyOut(int64_t progress)
{
    // EP (expert-parallel) path: destination rows are contiguous per core; each
    // loaded index selects which SOURCE row (rowIdx / k_) feeds the destination.
    int64_t curExpertLoopOffset = progress * curCorePerLoopIndicesElements_;
    LocalTensor<int32_t> subRowIdxLocal = expandedRowIdxCopyInQueue_.DeQue<int32_t>();
    for (int64_t indicesIndex = 0; indicesIndex < curLoopElements_; indicesIndex++) {
        int64_t rowIdx = subRowIdxLocal.GetValue(indicesIndex);
        // Flat destination row handled by this iteration.
        int64_t rowOffset = curExpertLoopOffset + indicesIndex + blockIdx_ * perCoreIndicesElements_;
        // Dropless mode caps output at activeNum_ rows; later rows are not produced.
        if (activeNum_ > 0 && dropPadMode_ == DROPLESS_MODE && rowOffset >= activeNum_) {
            break;
        }
        // Scalar GetValue above must complete before the next MTE2 copy-in starts.
        SetWaitFlag<HardEvent::S_MTE2>(HardEvent::S_MTE2);
        if (isInputScale_ == 1) {
            // One scale per source row; copied through the scale queue.
            int64_t scaleSrcOffset = rowIdx / k_;
            CopyScaleIn(scaleSrcOffset);
            CopyScaleOut(indicesIndex + curExpertLoopOffset + blockIdx_ * perCoreIndicesElements_);
        }
        int64_t curLoopCols = perLoopCols_;
        for (int64_t colsLoop = 0; colsLoop < colsLoops_; colsLoop++) {
            if (colsLoop == colsLoops_ - 1) {
                curLoopCols = lastLoopCols_;  // tail column tile
            }
            int64_t xSrcOffset = rowIdx / k_ * cols_;
            int64_t xDstOffset = (blockIdx_ * perCoreIndicesElements_ + curExpertLoopOffset + indicesIndex) * cols_;
            int64_t colsLoopOffset = colsLoop * perLoopCols_;
            // Tile-by-tile GM -> UB -> GM relay of the selected source row.
            CopyXIn(xSrcOffset + colsLoopOffset, curLoopCols);
            CopyXOut(xDstOffset + colsLoopOffset, curLoopCols);
        }
    }
    expandedRowIdxCopyInQueue_.FreeTensor(subRowIdxLocal);
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::Process()
|
||||
{
|
||||
if (blockIdx_ < needCoreNum_) {
|
||||
curLoopElements_ = curCorePerLoopIndicesElements_;
|
||||
for (int64_t loop = 0; loop < indicesLoops_; loop++) {
|
||||
if (loop == indicesLoops_ - 1) {
|
||||
curLoopElements_ = curCoreLastLoopIndicesElements_;
|
||||
}
|
||||
CopyExpertIn(loop);
|
||||
if constexpr (!EP) {
|
||||
GatherCopyOut(loop);
|
||||
} else {
|
||||
ScatterCopyOut(loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_OUT_H
|
||||
@@ -0,0 +1,210 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_out_droppad.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_OUT_DROPPAD_H
|
||||
#define MOE_CUSTOM_GATHER_OUT_DROPPAD_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t GATHER_OUT_DROPPAD_BUFFER_NUM = 2;
|
||||
|
||||
// Gather/expand stage of MoeInitRoutingCustom in drop-pad mode: copies input
// rows (and optional per-row scales) into expandedX following expandedRowIdx,
// skipping indices marked -1 (dropped tokens).
template <typename T>
class MoeGatherOutDroppad {
public:
    __aicore__ inline MoeGatherOutDroppad(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR expandedScale, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Load one slice of expandedRowIdx for loop iteration `progress`.
    __aicore__ inline void CopyInIndices(int64_t progress);
    // Relay x rows (and scales) GM -> UB -> GM for loop iteration `progress`.
    __aicore__ inline void CopyOut(int64_t progress);
    // Stage one scale value into `scaleLocal` and enqueue it.
    __aicore__ inline void CopyScaleIn(int64_t scaleSrcOffset, LocalTensor<float> scaleLocal);
    // Write one scale value from `scaleLocal` to expandedScale.
    __aicore__ inline void CopyScaleOut(int64_t scaleDstOffset, LocalTensor<float> scaleLocal);

private:
    TPipe *pipe_;
    // Double-buffered relay queues (see GATHER_OUT_DROPPAD_BUFFER_NUM).
    TQueBind<QuePosition::VECIN, QuePosition::VECOUT, GATHER_OUT_DROPPAD_BUFFER_NUM> xCopyInQueue_;
    TQueBind<TPosition::VECIN, TPosition::VECOUT, GATHER_OUT_DROPPAD_BUFFER_NUM> scaleCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_DROPPAD_BUFFER_NUM> expandedRowIdxCopyInQueue_;

    // Global-memory views bound in Init().
    GlobalTensor<T> inputXGm_;
    GlobalTensor<float> xGscaleGm_;
    GlobalTensor<T> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> expandedScaleGm_;

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Core/loop split parameters (rows = routing indices handled per core).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t currentLoopRows_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    // Column tiling (cols split into colLoops_ tiles of perLoopCols_).
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    // 1 when a per-row scale input is present.
    int64_t isInputScale_;

    // Scratch offsets reused across copy helpers.
    int64_t indicesOffset_;
    int64_t inputOffset_;
    int64_t outOffset_;
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyInIndices(int64_t progress)
|
||||
{
|
||||
indicesOffset_ = progress * perLoopRows_;
|
||||
LocalTensor<int32_t> indicesLocal = expandedRowIdxCopyInQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
|
||||
DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
|
||||
DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams);
|
||||
expandedRowIdxCopyInQueue_.EnQue<int32_t>(indicesLocal);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyScaleIn(int64_t scaleSrcOffset, LocalTensor<float> scaleLocal)
|
||||
{
|
||||
DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(1 * sizeof(float)), 0, 0, 0};
|
||||
DataCopyPadExtParams<float> padParams1{false, 0, 0, 0};
|
||||
DataCopyPad(scaleLocal, xGscaleGm_[scaleSrcOffset], copyParams1, padParams1);
|
||||
scaleCopyInQueue_.EnQue(scaleLocal);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyScaleOut(int64_t scaleDstOffset, LocalTensor<float> scaleLocal)
|
||||
{
|
||||
DataCopyExtParams copyParams3{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
|
||||
DataCopyPad(expandedScaleGm_[scaleDstOffset], scaleLocal, copyParams3);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyOut(int64_t progress)
|
||||
{
|
||||
LocalTensor<int32_t> indicesLocal = expandedRowIdxCopyInQueue_.DeQue<int32_t>();
|
||||
SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
|
||||
colsTileLength_ = perLoopCols_;
|
||||
for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
|
||||
int64_t initialRow = gatherOutTilingData_->perCoreIndicesElements * blockIdx_ + perLoopRows_ * progress;
|
||||
int64_t curLoopRow = 0;
|
||||
if (colsLoop == colLoops_ - 1) {
|
||||
colsTileLength_ = lastLoopCols_;
|
||||
}
|
||||
int64_t currentLoopStartRow = initialRow / k_;
|
||||
int64_t currentLoopLastRow = (initialRow + currentLoopRows_ - 1) / k_;
|
||||
for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
|
||||
if (isInputScale_ == 1) {
|
||||
CopyScaleIn(row, scaleLocal);
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.DeQue<float>();
|
||||
}
|
||||
inputOffset_ = row * cols_ + colsLoop * perLoopCols_;
|
||||
// input row position
|
||||
LocalTensor<T> inLocal = xCopyInQueue_.AllocTensor<T>();
|
||||
DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(T)), 0, 0, 0};
|
||||
DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
|
||||
DataCopyPad(inLocal, inputXGm_[inputOffset_], dataCopyParams, dataCopyPadParams);
|
||||
SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
|
||||
DataCopyExtParams intriParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(T)), 0, 0, 0};
|
||||
while (curLoopRow < currentLoopRows_ && initialRow / k_ == row) {
|
||||
int32_t outIndex = indicesLocal.GetValue(curLoopRow);
|
||||
curLoopRow++;
|
||||
initialRow++;
|
||||
if (outIndex == -1) {
|
||||
continue;
|
||||
}
|
||||
outOffset_ = outIndex * cols_ + colsLoop * perLoopCols_;
|
||||
DataCopyPad(expandedXGm_[outOffset_], inLocal, intriParams);
|
||||
if (isInputScale_ == 1) {
|
||||
CopyScaleOut(outIndex, scaleLocal);
|
||||
}
|
||||
}
|
||||
xCopyInQueue_.FreeTensor(inLocal);
|
||||
scaleCopyInQueue_.FreeTensor(scaleLocal);
|
||||
}
|
||||
}
|
||||
expandedRowIdxCopyInQueue_.FreeTensor(indicesLocal);
|
||||
}
|
||||
|
||||
template <typename T>
__aicore__ inline void MoeGatherOutDroppad<T>::Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR expandedRowIdx,
                                                    GM_ADDR expandedX, GM_ADDR expandedScale, GM_ADDR workspace,
                                                    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Bind GM addresses, unpack tiling parameters for this core, and size the
    // UB relay queues. Must run before Process().
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);

    needCoreNum_ = gatherOutTilingData_->needCoreNum;
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    isInputScale_ = tilingData->isInputScale;

    // Per-core row split: the last active core gets the tail partition.
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = gatherOutTilingData_->lastCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->lastCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->lastCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->lastCoreIndicesLoops;
    } else {
        coreRows_ = gatherOutTilingData_->perCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->perCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->perCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->perCoreIndicesLoops;
    }
    // Column tiling parameters shared by all cores.
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX, coreRows_ * cols_);
    xGscaleGm_.SetGlobalBuffer((__gm__ float *)scale, n_);
    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX, n_ * k_ * cols_);
    // Row-index view starts at this core's partition of expandedRowIdx.
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx +
                                          blockIdx_ * gatherOutTilingData_->perCoreIndicesElements,
                                      Align(coreRows_, sizeof(int32_t)));
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);

    // UB queues: one column tile of x, one loop's worth of indices, one scale.
    pipe_->InitBuffer(xCopyInQueue_, GATHER_OUT_DROPPAD_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(expandedRowIdxCopyInQueue_, GATHER_OUT_DROPPAD_BUFFER_NUM,
                      AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(scaleCopyInQueue_, GATHER_OUT_DROPPAD_BUFFER_NUM, AlignBytes(1, sizeof(float)));
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::Process()
|
||||
{
|
||||
if (blockIdx_ < needCoreNum_) {
|
||||
currentLoopRows_ = perLoopRows_;
|
||||
for (int64_t loop = 0; loop < rowLoops_; loop++) {
|
||||
if (loop == rowLoops_ - 1) {
|
||||
currentLoopRows_ = lastLoopRows_;
|
||||
}
|
||||
CopyInIndices(loop);
|
||||
CopyOut(loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_OUT_DROPPAD_H
|
||||
@@ -0,0 +1,242 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_sort_multi_core.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_SORT_MULTI_CORE_H
|
||||
#define MOE_CUSTOM_GATHER_SORT_MULTI_CORE_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t SORT32_ALIGN_ELEMENT = 32;
|
||||
constexpr int64_t PARALLEL_GATHERED_SORT_NEED_CORE_NUM = 16;
|
||||
constexpr int64_t MULTI_GATHERED_MAX_NUM = 4096; // 8192 * 8 / 16
|
||||
|
||||
// Multi-core partial sort of expert indices: each of the
// PARALLEL_GATHERED_SORT_NEED_CORE_NUM cores filters its slice of expertIdx to
// the local expert window [expertStart_, expertEnd_), sorts the survivors as
// (key, row-index) pairs, and writes the sorted run plus its length to the
// workspace for a later merge step.
class MoeGatherSortMultiCore {
public:
    __aicore__ inline MoeGatherSortMultiCore(){};
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Load this core's slice of expertIdx into the shared UB buffer.
    __aicore__ inline void CopyIn();
    // Filter, gather, and bitonic-sort the in-window expert indices.
    __aicore__ inline void Compute();
    // Write sorted (key, index) pairs and the per-core count to workspace.
    __aicore__ inline void CopyOut();

private:
    TPipe *pipe_;
    // Single flat UB scratch buffer; regions addressed via the *Offset_ fields.
    TBuf<TPosition::VECCALC> buffer_;
    GlobalTensor<int32_t> workspaceGm_;
    GlobalTensor<int32_t> expendedRowIdxGm_;
    GlobalTensor<int32_t> expertIdxGm_;
    // Workspace views: sorted keys, sorted indices, and per-core counts.
    GlobalTensor<float> sortedExpertIdxGm_;
    GlobalTensor<int32_t> sortedExpertIndexGm_;
    GlobalTensor<int32_t> sortedNumGm_;

    TQue<QuePosition::VECOUT, 1> sortedNumCopyOutQueue_;

    // Byte offsets of the regions inside buffer_ (computed in Init()).
    int64_t expertIdxOffset_ = 0;
    int64_t expertIndexOffset_ = 0;
    int64_t compareScalarMask0Offset_ = 0;
    int64_t compareScalarMask1Offset_ = 0;
    int64_t gatherMaskOffset_ = 0;

    int64_t totalLength_;
    // Local expert window [expertStart_, expertEnd_) handled by this rank.
    int64_t expertStart_ = 0;
    int64_t expertEnd_ = 0;
    // Number of indices surviving the window filter on this core.
    int64_t actual_expert_num_ = 0;
    int64_t needCoreNum_ = 0;
    int64_t perCoreElements_ = 0;
    int64_t blockIdx_;
    int64_t currentCoreElements_ = 0;
    // Survivor count rounded up to a full sort repeat (ONE_REPEAT_SORT_NUM).
    int64_t needSortNum_ = 0;
    // Sorted records are (key, value) pairs -> factor of 2 in sizing math.
    int64_t kvFactor = 2;

    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
    static constexpr int64_t MASK_STRIDE = 64;
};
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::CopyIn()
{
    // Load this core's slice of the expert-index array into its region of the
    // shared UB buffer, then fence MTE2 against the vector pipeline so
    // Compute() sees the data.
    LocalTensor<int32_t> expertIdxTensor = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    DataCopyPadExtParams padParams{false, 0, 0, 0};
    DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                 static_cast<uint32_t>(currentCoreElements_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPad(expertIdxTensor, expertIdxGm_[blockIdx_ * perCoreElements_], copyParams, padParams);
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::Compute()
{
    // Filter this core's expert indices to the local window [expertStart_,
    // expertEnd_), pair the survivors with their original row positions, and
    // sort them descending on the negated key (i.e. ascending expert id).
    LocalTensor<int32_t> expertIdx = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    LocalTensor<int32_t> gatheredExpertIdx = buffer_.Get<int32_t>();
    LocalTensor<float> gatheredExpertIdxFp32 = gatheredExpertIdx.ReinterpretCast<float>();

    // Convert to fp32 in place and negate, so a descending fp32 sort yields
    // ascending expert ids.
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, currentCoreElements_);
    PipeBarrier<PIPE_V>();
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, currentCoreElements_);
    PipeBarrier<PIPE_V>();

    LocalTensor<uint8_t> compareScalarMaskLocalTensor0 = buffer_.Get<uint8_t>()[compareScalarMask0Offset_];
    LocalTensor<uint8_t> compareScalarMaskLocalTensor1 = buffer_.Get<uint8_t>()[compareScalarMask1Offset_];
    LocalTensor<uint8_t> gatherMaskLocalTensor = buffer_.Get<uint8_t>()[gatherMaskOffset_];

    // Find elements >= expertStart_, which means -elements <= -expertStart_
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor0, expertIdxFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::LE,
        (currentCoreElements_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();

    // Find elements < expertEnd_, which means -elements > -expertEnd_
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor1, expertIdxFp32, static_cast<float>(-expertEnd_), AscendC::CMPMODE::GT,
        (currentCoreElements_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();

    // Get experts between [expert_start, expert_end)
    And(gatherMaskLocalTensor.ReinterpretCast<uint16_t>(), compareScalarMaskLocalTensor0.ReinterpretCast<uint16_t>(),
        compareScalarMaskLocalTensor1.ReinterpretCast<uint16_t>(),
        Ceil(currentCoreElements_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE / kvFactor);
    PipeBarrier<PIPE_V>();

    // Compact the in-window keys to the front of the buffer; sortedNum receives
    // the survivor count.
    uint64_t sortedNum = 0;
    GatherMaskParams gatherMaskParams;
    gatherMaskParams.repeatTimes = 1;
    gatherMaskParams.src0BlockStride = 1;
    gatherMaskParams.src0RepeatStride = DST_REP_STRIDE;
    gatherMaskParams.src1RepeatStride = DST_REP_STRIDE;
    GatherMask(gatheredExpertIdxFp32, expertIdxFp32, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(currentCoreElements_), gatherMaskParams, sortedNum);
    PipeBarrier<PIPE_V>();
    actual_expert_num_ = sortedNum;
    // Sort32 operates on full repeats; round the count up.
    int64_t needSortNum = Ceil(static_cast<int64_t>(sortedNum), ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    needSortNum_ = needSortNum;

    // Handle actual_expert_num_ == 0
    if (actual_expert_num_ < 1) {
        return;
    }

    // Build the value side: original flat positions of this core's elements,
    // compacted by the same mask.
    LocalTensor<int32_t> expertIndex = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<int32_t> gatheredExpertIndex = buffer_.Get<int32_t>()[needSortNum];
    ArithProgression<int32_t>(expertIndex, blockIdx_ * perCoreElements_, 1, currentCoreElements_);
    GatherMask(gatheredExpertIndex, expertIndex, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(currentCoreElements_), gatherMaskParams, sortedNum);
    PipeBarrier<PIPE_V>();
    // Pad the final partial repeat with MIN_FP32 so padding sorts to the end.
    int64_t duplicateNum = sortedNum % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = sortedNum - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(gatheredExpertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }
    PipeBarrier<PIPE_V>();

    // Interleave keys/values and sort; results land past the staging regions.
    LocalTensor<float> concatLocal;
    LocalTensor<float> sortTempTensor = buffer_.Get<float>()[needSortNum * kvFactor];
    Concat(concatLocal, gatheredExpertIdxFp32, sortTempTensor, needSortNum / ONE_REPEAT_SORT_NUM);
    LocalTensor<float> sortedLocal = buffer_.Get<float>()[needSortNum * kvFactor + needSortNum * kvFactor * kvFactor];
    Sort<float, true>(sortedLocal, concatLocal, gatheredExpertIndex.ReinterpretCast<uint32_t>(), sortTempTensor,
                      needSortNum / ONE_REPEAT_SORT_NUM);
    // CopyOut() reads sortedLocal via MTE3; fence vector -> MTE3.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::CopyOut()
{
    // Copy out sortedLocal for MergeSort: each core writes its sorted
    // (key, index) run to its fixed slot in the workspace. Slots are spaced by
    // 2 * perCoreElements_ (pairs), so runs never overlap across cores.
    if (actual_expert_num_ > 0) {
        LocalTensor<float> sortedLocal =
            buffer_.Get<float>()[needSortNum_ * kvFactor + needSortNum_ * kvFactor * kvFactor];
        DataCopyExtParams extParams{static_cast<uint16_t>(1),
                                    static_cast<uint32_t>(2 * actual_expert_num_ * sizeof(float)), 0, 0, 0};
        int64_t curCoreStartIndex = 2 * GetBlockIdx() * perCoreElements_;
        DataCopyPad(sortedExpertIdxGm_[curCoreStartIndex], sortedLocal, extParams);
    }

    // Copyout actual_expert_num_ — the per-core run length, written even when
    // it is zero so the merge step can read a complete count table.
    LocalTensor<int32_t> sortedNumOutLocal = sortedNumCopyOutQueue_.AllocTensor<int32_t>();
    sortedNumOutLocal.SetValue(0, actual_expert_num_);
    // Scalar SetValue must complete before the MTE3 copy reads the tensor.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    DataCopyExtParams copyParams3{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(uint32_t)), 0, 0, 0};
    DataCopyPad(sortedNumGm_[GetBlockIdx()], sortedNumOutLocal, copyParams3);

    sortedNumCopyOutQueue_.FreeTensor(sortedNumOutLocal);
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                                    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Bind GM views, split totalLength_ across the fixed sort-core count, and
    // lay out the regions of the flat UB scratch buffer.
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    totalLength_ = tilingData->n * tilingData->k;

    // Local expert window handled by this rank.
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;

    expertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expertIdx);

    expendedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx);

    workspaceGm_.SetGlobalBuffer((__gm__ int32_t *)workspace);

    // Workspace layout: [sorted keys][sorted indices][...][per-core counts].
    sortedExpertIdxGm_.SetGlobalBuffer((__gm__ float *)workspace);
    sortedExpertIndexGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(totalLength_, sizeof(int32_t)));

    // key and value
    sortedNumGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                 Align(totalLength_, sizeof(int32_t)) * kvFactor * kvFactor);

    // Fixed core count for the parallel gather-sort stage; last core takes the tail.
    needCoreNum_ = PARALLEL_GATHERED_SORT_NEED_CORE_NUM;
    perCoreElements_ = Ceil(totalLength_, needCoreNum_);

    int32_t lastCoreElements = totalLength_ - (needCoreNum_ - 1) * perCoreElements_;
    if (blockIdx_ == (needCoreNum_ - 1)) {
        currentCoreElements_ = lastCoreElements;
    } else {
        currentCoreElements_ = perCoreElements_;
    }

    // expertIdxOffset_ — byte offsets of the regions inside buffer_:
    // [gather output][expert idx/index staging][gather mask][cmp mask 0][cmp mask 1]
    expertIdxOffset_ = AlignBytes(currentCoreElements_, sizeof(int32_t));
    expertIndexOffset_ = expertIdxOffset_;

    gatherMaskOffset_ = expertIdxOffset_ * kvFactor;
    int64_t maskOffset =
        AlignBytes(Ceil(currentCoreElements_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE, sizeof(int8_t));
    compareScalarMask0Offset_ = gatherMaskOffset_ + maskOffset;
    compareScalarMask1Offset_ = compareScalarMask0Offset_ + maskOffset;
    int64_t bufferSize = MULTI_GATHERED_MAX_NUM * kvFactor * kvFactor * kvFactor * sizeof(int32_t);
    pipe_->InitBuffer(sortedNumCopyOutQueue_, 1, AlignBytes(1, sizeof(int32_t)));
    pipe_->InitBuffer(buffer_, bufferSize); // 73728 Bytes
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::Process()
{
    // Only the first PARALLEL_GATHERED_SORT_NEED_CORE_NUM cores participate in
    // the sort; every core still joins the barrier so the merge step can rely
    // on all partial runs being written.
    const bool coreActive = (blockIdx_ < PARALLEL_GATHERED_SORT_NEED_CORE_NUM);
    if (coreActive) {
        CopyIn();
        Compute();
        CopyOut();
    }
    SyncAll();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_SORT_MULTI_CORE_H
|
||||
@@ -0,0 +1,329 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_STATIC_QUANT_H
|
||||
#define MOE_CUSTOM_GATHER_STATIC_QUANT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t GATHER_OUT_QUANT_BUFFER_NUM = 2;
|
||||
|
||||
// Gather/expand stage with static int8 quantization: rows of x are copied into
// expandedX as int8 using a single (scale, offset) pair read from GM. The EP
// template flag selects the expert-parallel (scatter-layout) variant.
template <typename T, const int EP>
class MoeGatherOutQuant {
public:
    __aicore__ inline MoeGatherOutQuant(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedRowIdx,
                                GM_ADDR expandedX, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Load one slice of the row-index mapping for loop `progress`.
    __aicore__ inline void CopyExpertIn(int64_t progress);
    // Quantize the staged column tile (T -> half -> int8; see Compute body).
    __aicore__ inline void Compute(int64_t curLoopCols);
    // Stage one column tile of a source row into UB.
    __aicore__ inline void CopyXIn(int64_t xSrcOffset, int64_t curLoopCols);
    // Write one quantized int8 column tile to expandedX.
    __aicore__ inline void CopyXOut(int64_t xDstOffset, int64_t curLoopCols);
    // EP copy-out path (contiguous destinations).
    __aicore__ inline void ScatterCopyOut(int64_t progress);
    // Non-EP copy-out path (indexed destinations).
    __aicore__ inline void GatherCopyOut(int64_t progress);

private:
    TPipe *pipe_;
    // Double-buffered relay queues plus single-buffer quantization scratch.
    TQue<QuePosition::VECIN, GATHER_OUT_QUANT_BUFFER_NUM> inputXCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_QUANT_BUFFER_NUM> expandRowIdxCopyInQueue_;
    TQue<QuePosition::VECOUT, GATHER_OUT_QUANT_BUFFER_NUM> inputXCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> floatQueue_;
    TQue<QuePosition::VECOUT, 1> halfQueue_;

    // Global-memory views bound in Init().
    GlobalTensor<T> inputXGm_;
    GlobalTensor<int8_t> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> scaleGm_;
    GlobalTensor<float> offsetGm_;
    // EP only: total expanded-row count produced by earlier stages (workspace).
    GlobalTensor<int32_t> expertTotalCountGm_;

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Core/row/column split (computed in Init() from expertTotalCount_).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t perCoreRow_;
    int64_t currentLoopRows_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    // Static quantization parameters read once from GM in Init().
    float scale_;
    float offset_;
    // Layout of expandedRowIdx (GATHER vs SCATTER) and drop/pad behavior.
    int64_t rowIdxType_;
    int64_t dropPadMode_;
    int64_t activeNum_;
    int64_t indicesOffset_;
    int64_t coreNum_;
    int64_t inputOffset_;
    int64_t outOffset_;
    // Total rows to produce; n_ * k_ when not EP.
    int64_t expertTotalCount_;
};
|
||||
|
||||
template <typename T, const int EP>
__aicore__ inline void MoeGatherOutQuant<T, EP>::Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset,
                                                      GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR workspace,
                                                      const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Bind GM views, derive the dynamic core/row split from the actual expanded
    // row count, read the static quantization params, and size UB queues.
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();

    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    rowIdxType_ = tilingData->rowIdxType;
    dropPadMode_ = tilingData->dropPadMode;
    activeNum_ = tilingData->activeNum;
    coreNum_ = tilingData->coreNum;

    // core split
    int64_t actualExpertNum_ = tilingData->actualExpertNum;

    if constexpr (EP) {
        // EP: the true row count was produced by an earlier kernel stage and
        // lives in the workspace; invalidate the cache line before reading it.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            1);
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = n_ * k_;
    }

    // Re-split rows across cores using the runtime row count.
    perCoreRow_ = Ceil(expertTotalCount_, tilingData->coreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreRow_);
    int64_t lastCoreIndicesElements_ = expertTotalCount_ - (needCoreNum_ - 1) * perCoreRow_;

    // inner core split
    int64_t originPerLoopElements;
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = lastCoreIndicesElements_;
        originPerLoopElements = gatherOutTilingData_->lastCorePerLoopIndicesElements;
    } else {
        coreRows_ = perCoreRow_;
        originPerLoopElements = gatherOutTilingData_->perCorePerLoopIndicesElements;
    }
    perLoopRows_ = Min(coreRows_, originPerLoopElements);
    rowLoops_ = Ceil(coreRows_, perLoopRows_);
    lastLoopRows_ = coreRows_ - (rowLoops_ - 1) * perLoopRows_;

    // cols split
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);

    // Select the row-index source: the user buffer when its layout matches the
    // copy path, otherwise the remapped copy staged in the workspace.
    if constexpr (EP) {
        if (rowIdxType_ == SCATTER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        }
    } else {
        if (rowIdxType_ == GATHER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        }
    }


    // Static quantization: one scale and one offset, read once.
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale, 1);
    offsetGm_.SetGlobalBuffer((__gm__ float *)offset, 1);
    scale_ = scaleGm_.GetValue(0);
    offset_ = offsetGm_.GetValue(0);

    // UB queues: T-typed input tile, int8 output tile, index slice, and
    // float/half scratch for the quantization cast chain.
    pipe_->InitBuffer(inputXCopyInQueue_, GATHER_OUT_QUANT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(inputXCopyOutQueue_, GATHER_OUT_QUANT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(int8_t)));
    pipe_->InitBuffer(expandRowIdxCopyInQueue_, GATHER_OUT_QUANT_BUFFER_NUM, AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(floatQueue_, 1, AlignBytes(perLoopCols_, sizeof(float)));
    pipe_->InitBuffer(halfQueue_, 1, AlignBytes(perLoopCols_, sizeof(half)));
}
|
||||
|
||||
template <typename T, const int EP>
// Stage the per-loop slice of expanded row indices from GM into UB.
// `progress` is the row-loop counter; currentLoopRows_ rows are copied
// starting at progress * perLoopRows_ within this core's index window.
__aicore__ inline void MoeGatherOutQuant<T, EP>::CopyExpertIn(int64_t progress)
{
    indicesOffset_ = progress * perLoopRows_;
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.AllocTensor<int32_t>();
    // Non-aligned copy: blockCount=1, blockLen in bytes; no padding applied.
    DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams);
    expandRowIdxCopyInQueue_.EnQue<int32_t>(indicesLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Copy `curLoopCols` elements of one input row (starting at GM offset
// `xSrcOffset`) into UB for quantization.
__aicore__ inline void MoeGatherOutQuant<T, EP>::CopyXIn(int64_t xSrcOffset, int64_t curLoopCols)
{
    LocalTensor<T> inLocal = inputXCopyInQueue_.AllocTensor<T>();
    DataCopyExtParams copyParams0{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
    DataCopyPadExtParams<T> padParams0{false, 0, 0, 0};
    DataCopyPad(inLocal, inputXGm_[xSrcOffset], copyParams0, padParams0);
    inputXCopyInQueue_.EnQue(inLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Copy `curLoopCols` quantized int8 elements from UB to the expanded output
// tensor at GM offset `xDstOffset`, then release the UB tensor.
__aicore__ inline void MoeGatherOutQuant<T, EP>::CopyXOut(int64_t xDstOffset, int64_t curLoopCols)
{
    LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.DeQue<int8_t>();
    DataCopyExtParams copyParams2{1, static_cast<uint32_t>(curLoopCols * sizeof(int8_t)), 0, 0, 0};
    DataCopyPad(expandedXGm_[xDstOffset], outLocal, copyParams2);
    inputXCopyOutQueue_.FreeTensor(outLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Static-quantize one tile of `curLoopCols` elements to int8:
//   out = trunc(round_to_half(rint(x * scale_ + offset_)))
// Float inputs are used directly; half/bf16 inputs are first cast to float.
// PipeBarrier<PIPE_V>() between vector ops enforces the required ordering on
// the vector pipeline; the cast chain float->int32->half->int8 follows the
// hardware's supported conversion paths.
__aicore__ inline void MoeGatherOutQuant<T, EP>::Compute(int64_t curLoopCols)
{
    LocalTensor<float> floatLocal;
    LocalTensor<T> inLocal;
    LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.AllocTensor<int8_t>();
    LocalTensor<half> halfLocal = halfQueue_.AllocTensor<half>();
    uint32_t elements = Align(curLoopCols, sizeof(T));
    if constexpr (IsSameType<T, float>::value) {
        // Input is already float: quantize in place on the copy-in tensor.
        floatLocal = inputXCopyInQueue_.DeQue<float>();
    } else {
        inLocal = inputXCopyInQueue_.DeQue<T>();
        floatLocal = floatQueue_.AllocTensor<float>();
        Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
        PipeBarrier<PIPE_V>();
    }
    Muls(floatLocal, floatLocal, scale_, elements);
    PipeBarrier<PIPE_V>();
    Adds(floatLocal, floatLocal, offset_, elements);
    PipeBarrier<PIPE_V>();
    // Reuse the float buffer for the int32 intermediate (same byte size).
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, floatLocal, RoundMode::CAST_RINT, elements);
    PipeBarrier<PIPE_V>();
    // Identity dequant scale so the int32->half conversion is value-preserving.
    SetDeqScale((half)1.000000e+00f);
    PipeBarrier<PIPE_V>();
    Cast(halfLocal, intLocal, RoundMode::CAST_ROUND, elements);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_TRUNC, elements);
    inputXCopyOutQueue_.EnQue(outLocal);
    if constexpr (IsSameType<T, float>::value) {
        inputXCopyInQueue_.FreeTensor(floatLocal);
    } else {
        inputXCopyInQueue_.FreeTensor(inLocal);
        floatQueue_.FreeTensor(floatLocal);
    }
    halfQueue_.FreeTensor(halfLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Scatter-mode output (EP path): for each local destination row, read its
// source row id from the staged index tensor, quantize the source row in
// column tiles, and write it to this core's dense destination slot.
__aicore__ inline void MoeGatherOutQuant<T, EP>::ScatterCopyOut(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.DeQue<int32_t>();
    // Indices must have landed in UB before scalar GetValue() reads below.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    for (int64_t indicesIndex = 0; indicesIndex < currentLoopRows_; indicesIndex++) {
        int64_t rowOffset = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        int64_t rowIdx = indicesLocal.GetValue(indicesIndex);
        // rowIdx indexes the expanded (n*k) space; divide by k_ to recover
        // the source token row in the input.
        int64_t xSrcOffset = rowIdx / k_ * cols_;
        int64_t xDstOffset = (rowOffset + indicesIndex) * cols_;
        int64_t curLoopCols = perLoopCols_;
        // In dropless mode rows at or beyond activeNum_ are not emitted.
        if (activeNum_ > 0 && dropPadMode_ == DROPLESS_MODE && (rowOffset + indicesIndex) >= activeNum_) {
            break;
        }
        // Scalar index computation must complete before the next MTE2 copy-in.
        SetWaitFlag<HardEvent::S_MTE2>(HardEvent::S_MTE2);
        for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
            if (colsLoop == colLoops_ - 1) {
                curLoopCols = lastLoopCols_;
            }
            int64_t colsLoopOffset = colsLoop * perLoopCols_;
            CopyXIn(xSrcOffset + colsLoopOffset, curLoopCols);
            Compute(curLoopCols);
            CopyXOut(xDstOffset + colsLoopOffset, curLoopCols);
        }
    }
    expandRowIdxCopyInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Gather-mode output (non-EP path): iterate column tiles in the outer loop so
// that each distinct source row is loaded and quantized once, then scattered
// to every destination row (from the index tensor) that maps back to it.
// Destination index -1, or an index >= activeNum_ in dropless mode, is skipped.
__aicore__ inline void MoeGatherOutQuant<T, EP>::GatherCopyOut(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.DeQue<int32_t>();
    // Indices must be in UB before scalar GetValue() reads below.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    colsTileLength_ = perLoopCols_;
    for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
        int64_t initialRow = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        int64_t curLoopRow = 0;
        if (colsLoop == colLoops_ - 1) {
            colsTileLength_ = lastLoopCols_;
        }
        // Expanded rows [initialRow, initialRow + currentLoopRows_) map onto
        // source rows [initialRow/k_, (initialRow+currentLoopRows_-1)/k_].
        int64_t currentLoopStartRow = initialRow / k_;
        int64_t currentLoopLastRow = (initialRow + currentLoopRows_ - 1) / k_;
        for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
            inputOffset_ = row * cols_ + colsLoop * perLoopCols_;
            // input row position
            CopyXIn(inputOffset_, colsTileLength_);
            Compute(colsTileLength_);
            LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.DeQue<int8_t>();
            DataCopyExtParams intriParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(int8_t)), 0, 0, 0};
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            // Emit this quantized source row to every expanded row that
            // belongs to it (consecutive expanded rows share initialRow/k_).
            while (curLoopRow < currentLoopRows_ && initialRow / k_ == row) {
                int32_t outIndex = indicesLocal.GetValue(curLoopRow);
                curLoopRow++;
                initialRow++;
                if (outIndex == -1 || (dropPadMode_ == DROPLESS_MODE && outIndex >= activeNum_)) {
                    continue;
                }
                outOffset_ = outIndex * cols_ + colsLoop * perLoopCols_;
                DataCopyPad(expandedXGm_[outOffset_], outLocal, intriParams);
            }
            inputXCopyOutQueue_.FreeTensor(outLocal);
        }
    }
    expandRowIdxCopyInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Kernel entry for this stage: cores beyond needCoreNum_ are idle. Each row
// loop stages the index slice, then routes rows via the scatter (EP) or
// gather (non-EP) output path; the final loop handles the shorter tail.
__aicore__ inline void MoeGatherOutQuant<T, EP>::Process()
{
    if (blockIdx_ < needCoreNum_) {
        currentLoopRows_ = perLoopRows_;
        for (int64_t loop = 0; loop < rowLoops_; loop++) {
            if (loop == rowLoops_ - 1) {
                currentLoopRows_ = lastLoopRows_;
            }
            CopyExpertIn(loop);
            if constexpr (EP) {
                ScatterCopyOut(loop);
            } else {
                GatherCopyOut(loop);
            }
        }
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_STATIC_QUANT_H
|
||||
207
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort.h
Normal file
207
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort.h
Normal file
@@ -0,0 +1,207 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_H
|
||||
#define MOE_CUSTOM_MRGSORT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Tiling parameters for a multi-list merge sort pass.
struct MoeMrgsortParam {
    int64_t perListElements;    // element count of each list except the last
    int64_t lastListElements;   // element count of the final (possibly shorter) list
    int64_t oneLoopMaxElements; // max elements merged from one list per loop (UB capacity)
};
|
||||
|
||||
// Merges up to four pre-sorted lists (in AscendC sort-region layout) from GM
// into one sorted output stream, looping until all input elements are consumed.
// Usage: SetInput() once per list (max 4), SetOutput(), Init(), Process().
class MoeMrgsort {
public:
    __aicore__ inline MoeMrgsort(){};
    __aicore__ inline void Init(MoeMrgsortParam *param);
    __aicore__ inline void Process();
    __aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput);
    __aicore__ inline void SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput);

private:
    __aicore__ inline void CopyIn();
    __aicore__ inline void UpdateMrgParam();
    __aicore__ inline void MrgsortCompute();
    __aicore__ inline void UpdateSortInfo();
    __aicore__ inline void CopyOut();
    __aicore__ inline void ClearCache();

private:
    MoeMrgsortParam *param = nullptr;

    GlobalTensor<float> gmInputs[4]; // one GM source per input list
    GlobalTensor<float> gmOutput;

    LocalTensor<float> ubInputs[4];  // UB staging buffer per input list
    LocalTensor<float> ubOutput;

    int64_t listNum{0};              // number of registered input lists
    int64_t remainListNum{0};        // lists with elements left in the current loop
    int64_t outOffset{0};            // write position (sort-region units) in gmOutput
    int64_t offsets[4];              // per-list read offset into GM
    int64_t listRemainElements[4];   // per-list unconsumed element count
    int64_t lengths[4];              // per-list element count staged this loop
    int64_t allRemainElements{0};    // total unconsumed elements across lists
    int64_t curLoopSortedNum{0};     // elements emitted by the current loop

    // for MrgSort
    uint16_t validBitTail{0};        // bitmask of valid MrgSort source lists
    uint16_t elementCountListTail[4];
    uint32_t listSortedNums[4];      // per-list elements consumed by MrgSort
    LocalTensor<float> tmpUbInputs[4]; // compacted (non-empty) UB inputs
};
|
||||
|
||||
// Reset per-run bookkeeping so this sorter instance can be reused
// for a subsequent merge pass.
__aicore__ inline void MoeMrgsort::ClearCache()
{
    outOffset = 0;
    allRemainElements = 0;
    listNum = 0;
}
|
||||
|
||||
// Register one pre-sorted input list: its GM source and UB staging buffer.
// Lists occupy slots in registration order; at most 4 are supported.
__aicore__ inline void MoeMrgsort::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput)
{
    const int64_t slot = listNum;
    gmInputs[slot] = gmInput;
    ubInputs[slot] = ubInput;
    listNum = slot + 1;
}
|
||||
|
||||
// Register the GM destination and the UB buffer that receives each loop's
// merged result before copy-out.
__aicore__ inline void MoeMrgsort::SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput)
{
    this->gmOutput = gmOutput;
    this->ubOutput = ubOutput;
}
|
||||
|
||||
// Derive the MrgSort valid-list bitmask from the number of lists that still
// have data this loop, zeroing the element counts of unused tail slots.
// One bit per source list, lowest bit = list 0.
__aicore__ inline void MoeMrgsort::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;
    } else {
        // Zero or one remaining list; MrgsortCompute bypasses MrgSort then.
        validBitTail = 0b0001;
    }
}
|
||||
|
||||
// Stage the next chunk of each non-exhausted list from GM into UB, capped at
// oneLoopMaxElements per list, and compact the active lists into tmpUbInputs.
__aicore__ inline void MoeMrgsort::CopyIn()
{
    this->remainListNum = 0;
    // Previous loop's copy-out must finish before its UB buffers are refilled.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            // Copy in sort-region units (GetSortLen) rather than raw elements.
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the staged chunks of the active lists into ubOutput with the MrgSort
// hardware instruction (exhausted-suspension mode). With a single active list
// the data is simply copied through; listSortedNums reports per-list
// consumption for UpdateSortInfo.
__aicore__ inline void MoeMrgsort::MrgsortCompute()
{
    // Copy-in must complete before the vector MrgSort reads UB.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused source slots are filled with list 0; validBitTail masks them off.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: already sorted, pass it through unchanged.
        DataCopy(this->ubOutput, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Account for the elements MrgSort consumed this loop: shrink per-list and
// total remain counts, advance per-list GM read offsets, and total the number
// of elements emitted (curLoopSortedNum). Index j walks the compacted
// active-list arrays while i walks all registered lists.
__aicore__ inline void MoeMrgsort::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Copy this loop's merged result (in sort-region layout) from UB to GM and
// advance the output write offset.
__aicore__ inline void MoeMrgsort::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = GetSortLen<float>(curLoopSortedNum) * sizeof(float);
    // Vector merge must finish before MTE3 reads ubOutput.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput[outOffset], this->ubOutput, intriParams);
    outOffset += GetSortLen<float>(curLoopSortedNum);
}
|
||||
|
||||
// Initialize per-list offsets and remain counts from the tiling parameters.
// Must be called after all SetInput() registrations. allRemainElements is
// accumulated, relying on its zero initial value / ClearCache() reset from a
// prior Process() run.
__aicore__ inline void MoeMrgsort::Init(MoeMrgsortParam *param)
{
    this->param = param;
    this->remainListNum = listNum;

    for (int64_t i = 0; i < listNum; i++) {
        // Lists are laid out back-to-back in GM, perListElements apart
        // (in sort-region units).
        offsets[i] = GetSortOffset<float>(param->perListElements * i);
        if (i == listNum - 1) {
            listRemainElements[i] = param->lastListElements;
        } else {
            listRemainElements[i] = param->perListElements;
        }
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Drive the merge loop until every registered list is fully consumed,
// then reset internal state for possible reuse.
__aicore__ inline void MoeMrgsort::Process()
{
    while (allRemainElements > 0) {
        CopyIn();           // stage next chunk of each live list
        UpdateMrgParam();   // recompute valid-list bitmask
        MrgsortCompute();   // merge staged chunks in UB
        UpdateSortInfo();   // account consumed elements / offsets
        CopyOut();          // flush merged chunk to GM
    }

    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_H
|
||||
232
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort_out.h
Normal file
232
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort_out.h
Normal file
@@ -0,0 +1,232 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort_out.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_OUT_H
|
||||
#define MOE_CUSTOM_MRGSORT_OUT_H
|
||||
|
||||
#include "moe_custom_mrgsort.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
class MoeMrgsortOut {
|
||||
public:
|
||||
__aicore__ inline MoeMrgsortOut(){};
|
||||
__aicore__ inline void Init(MoeMrgsortParam *param, TPipe *tPipe);
|
||||
__aicore__ inline void Process();
|
||||
__aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput);
|
||||
__aicore__ inline void SetOutput(GlobalTensor<int32_t> &gmOutput1, GlobalTensor<int32_t> &gmOutput2,
|
||||
LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2);
|
||||
__aicore__ inline void SetBuffer(LocalTensor<float> &tempBuffer);
|
||||
|
||||
private:
|
||||
__aicore__ inline void CopyIn();
|
||||
__aicore__ inline void UpdateMrgParam();
|
||||
__aicore__ inline void MrgsortCompute();
|
||||
__aicore__ inline void UpdateSortInfo();
|
||||
__aicore__ inline void Extract();
|
||||
__aicore__ inline void CopyOut();
|
||||
__aicore__ inline void ClearCache();
|
||||
|
||||
private:
|
||||
MoeMrgsortParam *param = nullptr;
|
||||
|
||||
GlobalTensor<float> gmInputs[4];
|
||||
GlobalTensor<int32_t> gmOutput1;
|
||||
GlobalTensor<int32_t> gmOutput2;
|
||||
|
||||
LocalTensor<float> ubInputs[4];
|
||||
LocalTensor<float> tempBuffer;
|
||||
|
||||
// for extract
|
||||
LocalTensor<float> ubOutput1;
|
||||
LocalTensor<uint32_t> ubOutput2;
|
||||
|
||||
// for copy out
|
||||
LocalTensor<int32_t> ubOutputInt1;
|
||||
LocalTensor<int32_t> ubOutputInt2;
|
||||
|
||||
int64_t listNum{0};
|
||||
int64_t remainListNum{0};
|
||||
int64_t outOffset{0};
|
||||
int64_t offsets[4];
|
||||
int64_t listRemainElements[4];
|
||||
int64_t lengths[4];
|
||||
int64_t allRemainElements{0};
|
||||
int64_t curLoopSortedNum{0};
|
||||
|
||||
// for MrgSort
|
||||
uint16_t validBitTail;
|
||||
uint16_t elementCountListTail[4];
|
||||
uint32_t listSortedNums[4];
|
||||
LocalTensor<float> tmpUbInputs[4];
|
||||
};
|
||||
|
||||
// Reset per-run bookkeeping so this instance can run another merge pass.
__aicore__ inline void MoeMrgsortOut::ClearCache()
{
    outOffset = 0;
    allRemainElements = 0;
    listNum = 0;
}
|
||||
|
||||
// Register one pre-sorted input list (GM source + UB staging buffer);
// lists fill slots in registration order, at most 4.
__aicore__ inline void MoeMrgsortOut::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput)
{
    const int64_t slot = listNum;
    gmInputs[slot] = gmInput;
    ubInputs[slot] = ubInput;
    listNum = slot + 1;
}
|
||||
|
||||
// Register the two GM destinations and the UB buffers used by Extract():
// ubOutput1 receives scores (also viewed as int32 for copy-out) and
// ubOutput2 receives indices (viewed as uint32 for Extract, int32 for copy-out).
__aicore__ inline void MoeMrgsortOut::SetOutput(GlobalTensor<int32_t> &gmOutput1, GlobalTensor<int32_t> &gmOutput2,
                                                LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2)
{
    this->gmOutput1 = gmOutput1;
    this->ubOutput1 = ubOutput1;
    this->ubOutputInt1 = ubOutput1.ReinterpretCast<int32_t>();

    this->gmOutput2 = gmOutput2;
    this->ubOutput2 = ubOutput2.ReinterpretCast<uint32_t>();
    this->ubOutputInt2 = ubOutput2.ReinterpretCast<int32_t>();
}
|
||||
|
||||
// Register the scratch UB buffer that holds the merged sort-region result
// before Extract() splits it into scores and indices.
__aicore__ inline void MoeMrgsortOut::SetBuffer(LocalTensor<float> &tempBuffer)
{
    this->tempBuffer = tempBuffer;
}
|
||||
|
||||
// Derive the MrgSort valid-list bitmask from the number of lists that still
// have data this loop, zeroing the counts of unused tail slots.
// One bit per source list, lowest bit = list 0.
__aicore__ inline void MoeMrgsortOut::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;
    } else {
        // Zero or one remaining list; MrgsortCompute bypasses MrgSort then.
        validBitTail = 0b0001;
    }
}
|
||||
|
||||
// Stage the next chunk of each non-exhausted list from GM into UB, capped at
// oneLoopMaxElements per list, and compact the active lists into tmpUbInputs.
__aicore__ inline void MoeMrgsortOut::CopyIn()
{
    this->remainListNum = 0;
    // Previous loop's copy-out must finish before its UB buffers are refilled.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            // Copy in sort-region units (GetSortLen) rather than raw elements.
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the staged chunks of the active lists into tempBuffer with the MrgSort
// hardware instruction (exhausted-suspension mode). With a single active list
// the data is copied through unchanged; listSortedNums records per-list
// consumption for UpdateSortInfo.
__aicore__ inline void MoeMrgsortOut::MrgsortCompute()
{
    // Copy-in must complete before the vector MrgSort reads UB.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused source slots are filled with list 0; validBitTail masks them off.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: already sorted, pass it through unchanged.
        DataCopy(this->tempBuffer, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Account for the elements MrgSort consumed this loop: shrink per-list and
// total remain counts, advance per-list GM read offsets, and total the
// emitted element count. Index j walks the compacted active-list arrays
// while i walks all registered lists.
__aicore__ inline void MoeMrgsortOut::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Split the merged sort regions in tempBuffer into scores (ubOutput1) and
// indices (ubOutput2), undo the score negation applied before sorting
// (Muls by -1 — presumably the sort is descending on negated values; confirm
// against the producing kernel), then round scores to int32 for copy-out.
__aicore__ inline void MoeMrgsortOut::Extract()
{
    AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
    Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
    Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
}
|
||||
|
||||
// Copy this loop's extracted scores and indices (as int32) from UB to their
// GM destinations and advance the shared output offset.
__aicore__ inline void MoeMrgsortOut::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = curLoopSortedNum * sizeof(int32_t);
    // Vector extract/cast must finish before MTE3 reads the UB outputs.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput1[outOffset], this->ubOutputInt1, intriParams);
    DataCopyPad(this->gmOutput2[outOffset], this->ubOutputInt2, intriParams);

    outOffset += curLoopSortedNum;
}
|
||||
|
||||
// Initialize per-list offsets and remain counts from the tiling parameters.
// Must be called after all SetInput() registrations. `tPipe` is accepted for
// interface symmetry but unused here.
__aicore__ inline void MoeMrgsortOut::Init(MoeMrgsortParam *param, TPipe *tPipe)
{
    this->param = param;
    this->allRemainElements = 0;
    for (int64_t i = 0; i < listNum; i++) {
        // Lists are laid out back-to-back in GM, perListElements apart
        // (in sort-region units).
        offsets[i] = GetSortOffset<float>(param->perListElements * i);
        if (i == listNum - 1) {
            listRemainElements[i] = param->lastListElements;
        } else {
            listRemainElements[i] = param->perListElements;
        }
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Drive the merge-and-extract loop until every registered list is consumed,
// then reset internal state for possible reuse.
__aicore__ inline void MoeMrgsortOut::Process()
{
    while (allRemainElements > 0) {
        CopyIn();           // stage next chunk of each live list
        UpdateMrgParam();   // recompute valid-list bitmask
        MrgsortCompute();   // merge staged chunks into tempBuffer
        UpdateSortInfo();   // account consumed elements / offsets
        Extract();          // split into scores and indices
        CopyOut();          // flush both outputs to GM
    }
    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_OUT_H
|
||||
@@ -0,0 +1,239 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort_out_performance.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_OUT_PERFORMANCE_H
|
||||
#define MOE_CUSTOM_MRGSORT_OUT_PERFORMANCE_H
|
||||
|
||||
#include "moe_custom_mrgsort_performance.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Hardware MrgSort merges at most 4 lists at once.
constexpr int64_t MAX_MRGSORT_LIST = 4;
// Total source lists handled by the performance path (4 groups of 4).
constexpr int64_t MAX_MRGSORT_LIST_TOTAL = 16;
|
||||
|
||||
class MoeMrgsortOutPerformance {
|
||||
public:
|
||||
__aicore__ inline MoeMrgsortOutPerformance(){};
|
||||
__aicore__ inline void Init(MoeMrgsortPerformanceParam *param, TPipe *tPipe);
|
||||
__aicore__ inline void Process();
|
||||
__aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
|
||||
GlobalTensor<int32_t> &gmActualSortNum);
|
||||
__aicore__ inline void SetOutput(GlobalTensor<int32_t> &gmOutput1, GlobalTensor<int32_t> &gmOutput2,
|
||||
LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2);
|
||||
__aicore__ inline void SetBuffer(LocalTensor<float> &tempBuffer);
|
||||
|
||||
private:
|
||||
__aicore__ inline void CopyIn();
|
||||
__aicore__ inline void UpdateMrgParam();
|
||||
__aicore__ inline void MrgsortCompute();
|
||||
__aicore__ inline void UpdateSortInfo();
|
||||
__aicore__ inline void Extract();
|
||||
__aicore__ inline void CopyOut();
|
||||
__aicore__ inline void ClearCache();
|
||||
|
||||
private:
|
||||
MoeMrgsortPerformanceParam *param = nullptr;
|
||||
|
||||
GlobalTensor<float> gmInputs[4];
|
||||
GlobalTensor<int32_t> gmOutput1;
|
||||
GlobalTensor<int32_t> gmOutput2;
|
||||
GlobalTensor<int32_t> gmActualSortNum;
|
||||
|
||||
LocalTensor<float> ubInputs[4];
|
||||
LocalTensor<float> tempBuffer;
|
||||
|
||||
// for extract
|
||||
LocalTensor<float> ubOutput1;
|
||||
LocalTensor<uint32_t> ubOutput2;
|
||||
|
||||
// for copy out
|
||||
LocalTensor<int32_t> ubOutputInt1;
|
||||
LocalTensor<int32_t> ubOutputInt2;
|
||||
|
||||
int64_t listNum{0};
|
||||
int64_t remainListNum{0};
|
||||
int64_t outOffset{0};
|
||||
int64_t offsets[4] = {0};
|
||||
int64_t listRemainElements[4] = {0};
|
||||
int64_t lengths[4] = {0};
|
||||
int64_t allRemainElements{0};
|
||||
int64_t curLoopSortedNum{0};
|
||||
|
||||
// for MrgSort
|
||||
uint16_t validBitTail;
|
||||
uint16_t elementCountListTail[4] = {0};
|
||||
uint32_t listSortedNums[4] = {0};
|
||||
LocalTensor<float> tmpUbInputs[4];
|
||||
};
|
||||
|
||||
// Reset per-run bookkeeping so this instance can run another merge pass.
__aicore__ inline void MoeMrgsortOutPerformance::ClearCache()
{
    outOffset = 0;
    allRemainElements = 0;
    listNum = 0;
}
|
||||
|
||||
// Register one pre-sorted input list (GM source + UB staging buffer).
// The actual-sort-count tensor is captured once, on the first registration;
// subsequent calls pass it again but it is ignored.
__aicore__ inline void MoeMrgsortOutPerformance::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
                                                          GlobalTensor<int32_t> &gmActualSortNum)
{
    if (this->listNum == 0) {
        this->gmActualSortNum = gmActualSortNum;
    }
    this->gmInputs[listNum] = gmInput;
    this->ubInputs[listNum] = ubInput;
    this->listNum += 1;
}
|
||||
|
||||
// Register the two GM destinations and the UB buffers used by Extract():
// ubOutput1 receives scores (also viewed as int32 for copy-out) and
// ubOutput2 receives indices (viewed as uint32 for Extract, int32 for copy-out).
__aicore__ inline void MoeMrgsortOutPerformance::SetOutput(GlobalTensor<int32_t> &gmOutput1,
                                                           GlobalTensor<int32_t> &gmOutput2,
                                                           LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2)
{
    this->gmOutput1 = gmOutput1;
    this->ubOutput1 = ubOutput1;
    this->ubOutputInt1 = ubOutput1.ReinterpretCast<int32_t>();

    this->gmOutput2 = gmOutput2;
    this->ubOutput2 = ubOutput2.ReinterpretCast<uint32_t>();
    this->ubOutputInt2 = ubOutput2.ReinterpretCast<int32_t>();
}
|
||||
|
||||
// Register the scratch UB buffer that holds the merged sort-region result
// before Extract() splits it into scores and indices.
__aicore__ inline void MoeMrgsortOutPerformance::SetBuffer(LocalTensor<float> &tempBuffer)
{
    this->tempBuffer = tempBuffer;
}
|
||||
|
||||
// Derive the MrgSort valid-list bitmask from the number of lists that still
// have data this loop, zeroing the counts of unused tail slots.
// One bit per source list, lowest bit = list 0.
__aicore__ inline void MoeMrgsortOutPerformance::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;
    } else {
        // Zero or one remaining list; MrgsortCompute bypasses MrgSort then.
        validBitTail = 0b0001;
    }
}
|
||||
|
||||
// Stage the next chunk of each non-exhausted list from GM into UB, capped at
// oneLoopMaxElements per list, and compact the active lists into tmpUbInputs.
__aicore__ inline void MoeMrgsortOutPerformance::CopyIn()
{
    this->remainListNum = 0;
    // Previous loop's copy-out must finish before its UB buffers are refilled.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            // Copy in sort-region units (GetSortLen) rather than raw elements.
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the staged chunks of the active lists into tempBuffer with the MrgSort
// hardware instruction (exhausted-suspension mode). With a single active list
// the data is copied through unchanged; listSortedNums records per-list
// consumption for UpdateSortInfo.
__aicore__ inline void MoeMrgsortOutPerformance::MrgsortCompute()
{
    // Copy-in must complete before the vector MrgSort reads UB.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused source slots are filled with list 0; validBitTail masks them off.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: already sorted, pass it through unchanged.
        DataCopy(this->tempBuffer, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Account for the elements MrgSort consumed this loop: shrink per-list and
// total remain counts, advance per-list GM read offsets, and total the
// emitted element count. Index j walks the compacted active-list arrays
// while i walks all registered lists.
__aicore__ inline void MoeMrgsortOutPerformance::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Split the merged (score, index) records in tempBuffer into two tensors,
// then negate the extracted scores (Muls by -1 — presumably undoing a
// negation applied upstream to obtain the desired sort order; TODO confirm)
// and cast them to int32 with round-to-nearest into ubOutputInt1.
__aicore__ inline void MoeMrgsortOutPerformance::Extract()
{
    AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
    Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
    Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
}
|
||||
|
||||
// Write the curLoopSortedNum int32 results of this pass to GM and advance
// the running output offset.
// NOTE(review): ubOutputInt2 is copied out here but its conversion from
// ubOutput2 is not visible in this file section — confirm it is filled
// before this point.
__aicore__ inline void MoeMrgsortOutPerformance::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = curLoopSortedNum * sizeof(int32_t);  // blockLen is in bytes for DataCopyPad
    // V->MTE3 event: results must be fully produced in UB before the DMA out.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput1[outOffset], this->ubOutputInt1, intriParams);
    DataCopyPad(this->gmOutput2[outOffset], this->ubOutputInt2, intriParams);

    outOffset += curLoopSortedNum;
}
|
||||
|
||||
// Initialize per-list remaining counts and GM read offsets.
// Each of the listNum output-stage lists aggregates MAX_MRGSORT_LIST
// source counts from gmActualSortNum (hence the i / MAX_MRGSORT_LIST
// fold over MAX_MRGSORT_LIST_TOTAL entries).
__aicore__ inline void MoeMrgsortOutPerformance::Init(MoeMrgsortPerformanceParam *param, TPipe *tPipe)
{
    this->param = param;
    for (int64_t i = 0; i < MAX_MRGSORT_LIST_TOTAL; i++) {
        listRemainElements[i / MAX_MRGSORT_LIST] += static_cast<int64_t>(gmActualSortNum.GetValue(i));
    }
    for (int64_t i = 0; i < listNum; i++) {
        // Each list's region in GM spans perListElements * MAX_MRGSORT_LIST
        // sorted records (record size accounted for by GetSortOffset).
        offsets[i] = GetSortOffset<float>(param->perListElements * i * MAX_MRGSORT_LIST);
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Main loop: repeatedly load chunks of the sorted sub-lists, merge them,
// extract and convert the results, and write them out, until every input
// element has been consumed. Resets cached state afterwards so the object
// can be reused.
__aicore__ inline void MoeMrgsortOutPerformance::Process()
{
    for (; allRemainElements > 0;) {
        CopyIn();
        UpdateMrgParam();
        MrgsortCompute();
        UpdateSortInfo();
        Extract();
        CopyOut();
    }
    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_OUT_PERFORMANCE_H
|
||||
@@ -0,0 +1,206 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort_performance.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_PERFORMANCE_H
|
||||
#define MOE_CUSTOM_MRGSORT_PERFORMANCE_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Tiling parameters for the merge-sort stage.
struct MoeMrgsortPerformanceParam {
    int64_t perListElements;    // capacity (in elements) of each input list's GM region
    int64_t oneLoopMaxElements; // max elements taken from each list per merge pass
};
|
||||
|
||||
// Merges up to 4 independently sorted lists (float (score, index) records
// in GM) into a single sorted output stream, processing a bounded chunk of
// each list per loop iteration.
// Usage: SetInput() once per source list, SetOutput() once, then Init()
// followed by Process().
class MoeMrgsortPerformance {
public:
    __aicore__ inline MoeMrgsortPerformance(){};
    __aicore__ inline void Init(MoeMrgsortPerformanceParam *param);
    __aicore__ inline void Process();
    __aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
                                    GlobalTensor<int32_t> &gmActualSortNum);
    __aicore__ inline void SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput);

private:
    __aicore__ inline void CopyIn();
    __aicore__ inline void UpdateMrgParam();
    __aicore__ inline void MrgsortCompute();
    __aicore__ inline void UpdateSortInfo();
    __aicore__ inline void CopyOut();
    __aicore__ inline void ClearCache();

private:
    MoeMrgsortPerformanceParam *param = nullptr;

    GlobalTensor<float> gmInputs[4];          // one GM source region per input list
    GlobalTensor<float> gmOutput;             // merged output region
    GlobalTensor<int32_t> gmActualSortNum;    // actual element count per source list

    LocalTensor<float> ubInputs[4];           // UB staging buffer per input list
    LocalTensor<float> ubOutput;              // UB merge destination

    int64_t listNum{0};                       // number of registered input lists (<= 4)
    int64_t remainListNum{0};                 // lists still holding data in the current pass
    int64_t outOffset{0};                     // running write offset into gmOutput
    int64_t offsets[4];                       // per-list GM read offsets
    int64_t listRemainElements[4];            // unconsumed elements per list
    int64_t lengths[4];                       // elements taken from each list this pass
    int64_t allRemainElements{0};             // total unconsumed elements across lists
    int64_t curLoopSortedNum{0};              // elements merged in the current pass

    // for MrgSort
    uint16_t validBitTail{0};                 // bitmask of active MrgSortSrcList slots
    uint16_t elementCountListTail[4];         // per-slot element counts fed to MrgSort
    uint32_t listSortedNums[4];               // per-slot merged counts returned by MrgSort
    LocalTensor<float> tmpUbInputs[4];        // compacted view of the active UB inputs
};
|
||||
|
||||
// Reset the reusable state so the next SetInput()/Init()/Process() cycle
// starts from a clean slate.
__aicore__ inline void MoeMrgsortPerformance::ClearCache()
{
    this->outOffset = 0;
    this->allRemainElements = 0;
    this->listNum = 0;
}
|
||||
|
||||
// Register one sorted source list (GM region + its UB staging buffer).
// The actual-count tensor is captured only on the first call; subsequent
// calls are assumed to pass the same gmActualSortNum.
// Must be called at most 4 times (array capacity); not bounds-checked here.
__aicore__ inline void MoeMrgsortPerformance::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
                                                       GlobalTensor<int32_t> &gmActualSortNum)
{
    if (this->listNum == 0) {
        this->gmActualSortNum = gmActualSortNum;
    }
    this->gmInputs[listNum] = gmInput;
    this->ubInputs[listNum] = ubInput;
    this->listNum += 1;
}
|
||||
|
||||
// Register the merge destination: the GM output region and the UB buffer
// that MrgsortCompute writes merged records into.
__aicore__ inline void MoeMrgsortPerformance::SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput)
{
    this->ubOutput = ubOutput;
    this->gmOutput = gmOutput;
}
|
||||
|
||||
// Derive the MrgSort valid-slot bitmask from the number of lists that still
// hold data this pass, zeroing the element counts of unused tail slots.
__aicore__ inline void MoeMrgsortPerformance::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;    // slots 0 and 1 active
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;    // slots 0..2 active
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;    // all four slots active
    } else {
        validBitTail = 0b0001;    // single-list (or idle) case
    }
}
|
||||
|
||||
// Load the next chunk of each still-active list from GM into UB and compact
// the active lists into tmpUbInputs / elementCountListTail for MrgSort.
__aicore__ inline void MoeMrgsortPerformance::CopyIn()
{
    this->remainListNum = 0;
    // MTE3->MTE2 event: the previous pass's copy-out must finish before the
    // UB input buffers are overwritten.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        // Take at most oneLoopMaxElements from this list per pass.
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            // Narrowing int64 -> uint16; assumes oneLoopMaxElements fits in
            // uint16_t (MrgSort count type) — TODO confirm against tiling.
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the 2..4 active sub-lists staged in UB directly into ubOutput.
// remainListNum / elementCountListTail / validBitTail were prepared by
// CopyIn and UpdateMrgParam; listSortedNums receives per-list merged counts.
__aicore__ inline void MoeMrgsortPerformance::MrgsortCompute()
{
    // MTE2->V event: the CopyIn DMA must complete before the vector unit
    // reads the staged lists.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused MrgSortSrcList slots are padded with list 0; validBitTail
        // masks them out of the merge.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: copy its sorted region through unchanged.
        DataCopy(this->ubOutput, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Book-keeping after one merge pass: consume the merged counts from the
// per-list and global remaining counters and advance each active list's GM
// read offset. j indexes only the active lists, mirroring CopyIn's compaction.
__aicore__ inline void MoeMrgsortPerformance::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Write the merged (score, index) records of this pass to GM, keeping the
// full sort-record layout (hence GetSortLen rather than raw element count),
// and advance the output offset in sort-record units.
__aicore__ inline void MoeMrgsortPerformance::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = GetSortLen<float>(curLoopSortedNum) * sizeof(float);  // bytes
    // V->MTE3 event: the merge must be complete in UB before the DMA out.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput[outOffset], this->ubOutput, intriParams);
    outOffset += GetSortLen<float>(curLoopSortedNum);
}
|
||||
|
||||
// Initialize per-list GM read offsets (each list owns a perListElements-sized
// region) and remaining counts from gmActualSortNum (captured in SetInput).
// Must be called after all SetInput() calls, since it iterates listNum.
__aicore__ inline void MoeMrgsortPerformance::Init(MoeMrgsortPerformanceParam *param)
{
    this->param = param;
    for (int64_t i = 0; i < listNum; i++) {
        offsets[i] = GetSortOffset<float>(param->perListElements * i);
        listRemainElements[i] = static_cast<int64_t>(gmActualSortNum.GetValue(i));
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Main loop: repeatedly load chunks of the sorted sub-lists, merge them in
// UB and write the merged records out, until every input element has been
// consumed. Unlike MoeMrgsortOutPerformance::Process, no Extract step —
// results stay in sort-record form. Resets cached state afterwards.
__aicore__ inline void MoeMrgsortPerformance::Process()
{
    for (; allRemainElements > 0;) {
        CopyIn();
        UpdateMrgParam();
        MrgsortCompute();
        UpdateSortInfo();
        CopyOut();
    }

    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_PERFORMANCE_H
|
||||
@@ -0,0 +1,204 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_row_idx_gather.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_ROW_IDX_GATHER_H
|
||||
#define MOE_CUSTOM_ROW_IDX_GATHER_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Produces the expanded-row-index mapping for MoE routing: generates
// consecutive destination indices (via a precomputed assist table) and
// scatters them into expandedRowIdx at positions given by the sorted expert
// indices. Work is tiled across cores and loops per the tiling data.
class RowIdxGather {
public:
    __aicore__ inline RowIdxGather(){};
    __aicore__ inline void Init(GM_ADDR expandedRowIdx, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn(int64_t loop, int64_t elements);
    __aicore__ inline void Compute(int64_t loop, int64_t elements);
    __aicore__ inline void CopyOut(int64_t loop, int64_t elements, GlobalTensor<int32_t> &RowIdxDstGm_);
    __aicore__ inline void AssistInit();

private:
    GlobalTensor<int32_t> expandedRowIdxGm_;       // output mapping (or workspace staging when ep_==0 && SCATTER)
    GlobalTensor<int32_t> sortedExpertIndicesGm_;  // this core's slice of sorted indices (scatter destinations)
    GlobalTensor<int64_t> expertTokensCountGm_;
    GlobalTensor<int32_t> expertTotalCountGm_;     // total routed-token count written by an earlier stage (ep path)
    GlobalTensor<int32_t> assistGm_;               // precomputed 0..ASSIST_NUM-1 index table
    GlobalTensor<int32_t> gatherIndicesGm_;

    TPipe *pipe_;

    TQue<QuePosition::VECIN, 1> sortedExpertIndicesInQueue_;
    TQue<QuePosition::VECOUT, 1> copyOutQueue_;
    TBuf<TPosition::VECCALC> assistBuffer_;

    const MoeCustomSrcToDstComputeTilingData *srcToDstComputeTilingData_;
    int64_t blockIdx_;             // this core's index
    int64_t needCoreNum_;          // cores actually needed for expertTotalCount_
    int64_t perCoreElements_;      // elements handled per core
    int64_t actualExpertNum_ = 0;
    int64_t ep_ = 0;               // expert-parallel flag from tiling
    int64_t rowIdxType_ = 0;       // SCATTER vs. gather layout selector
    int64_t expertTotalCount_ = 0; // total number of routed tokens

    int64_t loops_ = 0;            // loop count for this core
    int64_t perLoopElements_ = 0;
    int64_t lastLoopElements_ = 0;
};
|
||||
|
||||
// Load the assist index table from GM and bias it by this core's starting
// element, so each entry holds the core-global destination index base.
__aicore__ inline void RowIdxGather::AssistInit()
{
    LocalTensor<int32_t> assistTensor = assistBuffer_.Get<int32_t>(ASSIST_NUM);
    DataCopy(assistTensor, assistGm_, ASSIST_NUM);
    // Wait for the DMA before the vector Adds reads the table.
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
    Adds(assistTensor, assistTensor, (int32_t)(blockIdx_ * perCoreElements_), ASSIST_NUM);
}
|
||||
|
||||
// Configure GM buffers and per-core/per-loop tiling.
// Workspace layout used here (int32 units, each region aligned):
//   [0, n*k)                     : first staging region
//   [n*k, 2*n*k)                 : sorted-indices / staging region
//   [2*n*k + actualExpertNum, ..): expert total count (ep path)
// (TODO confirm layout against the workspace producer, not visible here.)
__aicore__ inline void RowIdxGather::Init(GM_ADDR expandedRowIdx, GM_ADDR workspace,
                                          const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    srcToDstComputeTilingData_ = &(tilingData->srcToDstComputeParamsOp);
    blockIdx_ = GetBlockIdx();
    actualExpertNum_ = tilingData->actualExpertNum;
    ep_ = tilingData->ep;
    rowIdxType_ = tilingData->rowIdxType;

    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, actualExpertNum_);

    if (ep_) {
        // Expert-parallel: the routed-token total is produced by a previous
        // stage in workspace; invalidate the cache line before reading it.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                                Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            actualExpertNum_);
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = tilingData->n * tilingData->k;
    }
    // 'assist' is an external index table symbol (defined elsewhere — presumably
    // in moe_custom_common.h; verify).
    assistGm_.SetGlobalBuffer((__gm__ int32_t *)assist, ASSIST_NUM);
    perCoreElements_ = Ceil(expertTotalCount_, srcToDstComputeTilingData_->needCoreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreElements_);

    // Split each core's share into loops; the last core and the last loop
    // carry the remainders.
    int64_t lastCoreElements = expertTotalCount_ - (needCoreNum_ - 1) * perCoreElements_;
    int64_t perCoreLoops = Ceil(perCoreElements_, srcToDstComputeTilingData_->perCorePerLoopElements);
    int64_t perCorePerLoopElements = Ceil(perCoreElements_, perCoreLoops);
    int64_t perCoreLastLoopElements = perCoreElements_ - (perCoreLoops - 1) * perCorePerLoopElements;

    int64_t lastCoreLoops = Ceil(lastCoreElements, srcToDstComputeTilingData_->perCorePerLoopElements);
    int64_t lastCorePerLoopElements = Ceil(lastCoreElements, lastCoreLoops);
    int64_t lastCoreLastLoopELements = lastCoreElements - (lastCoreLoops - 1) * lastCorePerLoopElements;

    loops_ = perCoreLoops;
    if (blockIdx_ == needCoreNum_ - 1) {
        loops_ = lastCoreLoops;
        perLoopElements_ = lastCorePerLoopElements;
        lastLoopElements_ = lastCoreLastLoopELements;
    } else {
        loops_ = perCoreLoops;
        perLoopElements_ = perCorePerLoopElements;
        lastLoopElements_ = perCoreLastLoopElements;
    }

    // Source of the scatter destinations depends on the row-index layout.
    if (rowIdxType_ == SCATTER) {
        sortedExpertIndicesGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreElements_,
                                               actualExpertNum_);
    } else {
        sortedExpertIndicesGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                                   Align(tilingData->n * tilingData->k, sizeof(int32_t)) +
                                                   blockIdx_ * perCoreElements_,
                                               actualExpertNum_);
    }

    // SCATTER without expert-parallel writes results into workspace instead
    // of directly into the output buffer.
    if ((ep_ == 0 && rowIdxType_ == SCATTER) && (blockIdx_ < needCoreNum_)) {
        expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                          Align(tilingData->n * tilingData->k, sizeof(int32_t)));
    }
    pipe_->InitBuffer(sortedExpertIndicesInQueue_, 1, AlignBytes(perLoopElements_, sizeof(int32_t)));
    // One 32-byte block per output element (scatter writes one element per pad copy).
    pipe_->InitBuffer(copyOutQueue_, 1, Ceil(perLoopElements_, ASSIST_NUM) * ASSIST_NUM * BLOCK_BYTES);
    pipe_->InitBuffer(assistBuffer_, ASSIST_NUM * sizeof(int32_t));
}
|
||||
|
||||
// Drive the per-loop CopyIn/Compute/CopyOut pipeline on participating cores.
// The ep_==1 && SCATTER combination has nothing to do here (results already
// in place); every core still hits the trailing SyncAll barrier.
__aicore__ inline void RowIdxGather::Process()
{
    if (ep_ == 1 && rowIdxType_ == SCATTER) {
        return;
    } else {
        if (blockIdx_ < needCoreNum_) {
            AssistInit();
            for (int64_t loop = 0; loop < loops_; loop++) {
                int64_t elements = perLoopElements_;
                if (loop == loops_ - 1) {
                    elements = lastLoopElements_;
                }
                CopyIn(loop, elements);
                Compute(loop, elements);
                CopyOut(loop, elements, expandedRowIdxGm_);
            }
        }
    }
    // NOTE(review): the early return above skips this barrier on the
    // ep_==1 && SCATTER path — confirm that is intended.
    AscendC::SyncAll();
}
|
||||
|
||||
// Load this loop's slice of sorted expert indices (the scatter destinations)
// from GM into UB via the input queue. DataCopyPad handles the unaligned tail.
__aicore__ inline void RowIdxGather::CopyIn(int64_t loop, int64_t elements)
{
    LocalTensor<int32_t> sortedExpertIndicesInLocal = sortedExpertIndicesInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(elements * sizeof(int32_t)), 0, 0,
                                     0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(sortedExpertIndicesInLocal, sortedExpertIndicesGm_[loop * perLoopElements_], dataCopyParams,
                dataCopyPadParams);
    sortedExpertIndicesInQueue_.EnQue(sortedExpertIndicesInLocal);
}
|
||||
|
||||
// Generate the consecutive source-row indices for this loop's elements:
// each ASSIST_INDEX_NUM-sized chunk is the biased assist table plus the
// chunk's global offset (perLoopElements_ * loop + i * ASSIST_INDEX_NUM).
__aicore__ inline void RowIdxGather::Compute(int64_t loop, int64_t elements)
{
    LocalTensor<int32_t> outLocal = copyOutQueue_.AllocTensor<int32_t>();
    LocalTensor<int32_t> assistTensor = assistBuffer_.Get<int32_t>(ASSIST_NUM);
    PipeBarrier<PIPE_V>();
    int64_t loops = Ceil(elements, ASSIST_INDEX_NUM);
    for (int64_t i = 0; i < loops; i++) {
        Adds(outLocal[i * ASSIST_NUM], assistTensor,
             static_cast<int32_t>(perLoopElements_ * loop + i * ASSIST_INDEX_NUM), ASSIST_NUM);
    }
    PipeBarrier<PIPE_V>();
    copyOutQueue_.EnQue<int32_t>(outLocal);
}
|
||||
|
||||
// Scatter: for each element, read its destination index from inLocal on the
// scalar unit and DMA one int32 from outLocal to that GM position.
// One DataCopyPad per element — scalar-driven scatter, with S->MTE3 sync
// inside the loop so each address read is ordered before its copy.
__aicore__ inline void RowIdxGather::CopyOut(int64_t loop, int64_t elements, GlobalTensor<int32_t> &RowIdxDstGm_)
{
    LocalTensor<int32_t> inLocal = sortedExpertIndicesInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue_.DeQue<int32_t>();
    // Ensure the destination indices are in UB before the scalar reads below.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = sizeof(int32_t);  // one element per transfer
    uint32_t outOffset;
    for (int64_t idx = 0; idx < elements; idx++) {
        outOffset = inLocal.GetValue(idx);
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        // outLocal holds one value per 32-byte block (INT32_ONE_BLOCK_NUM stride).
        DataCopyPad(RowIdxDstGm_[outOffset], outLocal[idx * INT32_ONE_BLOCK_NUM], intriParams);
    }

    sortedExpertIndicesInQueue_.FreeTensor(inLocal);
    copyOutQueue_.FreeTensor(outLocal);
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_ROW_IDX_GATHER_H
|
||||
@@ -0,0 +1,306 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_row_idx_gather_droppad.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_H
|
||||
#define MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Drop-and-pad routing: builds the src->dst row mapping under a fixed
// per-expert capacity. Tokens beyond an expert's capacity are dropped;
// unfilled capacity slots in expandedX (and optionally expandedScale) are
// zero-padded. T is the token dtype; TilingData supplies the per-core split.
template <typename T, typename TilingData>
class MoeCustomSrcToDstWithCapacity {
public:
    __aicore__ inline MoeCustomSrcToDstWithCapacity(){};
    __aicore__ inline void Init(GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR expandedScale, GM_ADDR workspace,
                                const TilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn(int64_t progress);
    __aicore__ inline void CopyOut(int64_t progress);
    __aicore__ inline void CopyOutRemain();
    __aicore__ inline void SyncAll();
    __aicore__ inline void AssistInit();

private:
    TPipe *pipe;
    TQue<QuePosition::VECIN, 1> copyInQueue;        // dst->src rows + expert indices
    TQue<QuePosition::VECOUT, 1> copyOutQueue;      // single-index scatter staging
    TQue<QuePosition::VECOUT, 1> copyOutZeroQueue;  // zero-filled row used for padding
    TQue<QuePosition::VECOUT, 1> scaleOutZeroQueue; // zero scale used for padding

    GlobalTensor<int32_t> expandDstToSrcRowGm;   // sorted dst->src row mapping (workspace)
    GlobalTensor<int32_t> expandedRowIdxGm;      // output: src row -> capacity slot index
    GlobalTensor<int32_t> expertIdxValueGm;      // per-core (last expert id, count) pairs (workspace)
    GlobalTensor<int32_t> expandedExpertIdxGm;   // sorted expert index per row (workspace)
    GlobalTensor<T> expandedXGm;                 // output token buffer, expertNum * expertCapacity rows
    GlobalTensor<float> expandedScaleGm;         // output scale buffer (optional)

    LocalTensor<T> outTmpLocal;                  // zero row for padding dropped slots
    LocalTensor<float> scaleLocal;               // zero scale for padding

    const MoeCustomSrcToDstCapacityComputeTilingData *srcToDstTilingData;
    int64_t coreNum;
    int64_t blockIdx;
    int64_t totalLength;       // n * k routed tokens
    int64_t currentLoopRows;   // rows handled in the current loop
    int64_t coreRows;          // rows assigned to this core
    int64_t perLoopRows;
    int64_t lastLoopRows;
    int64_t rowLoops;
    int64_t expertCapacity;    // max tokens kept per expert
    int64_t expertNum;
    int64_t cols;              // hidden dimension
    int64_t perLoopCols;
    int64_t lastLoopCols;
    int64_t colLoops;
    int64_t isInputScale_;
    int64_t quantMode_;

    int64_t tokenCount = 0;            // tokens emitted so far for lastExpertId
    int32_t lastExpertId = -1;         // expert currently being filled (-1 = not started)
    int32_t lastCoreExpertId = 0;      // carry-in expert id from preceding cores
    int32_t lastCoreExpertIdNum = 0;   // carry-in token count for that expert
    bool needScaleCopy = false;        // pad expandedScale too (non-quant scale path)
};
|
||||
|
||||
// Prepare the zero-padding tensors and compute this core's carry-in state:
// which expert the previous cores left off at (lastCoreExpertId) and how
// many of its tokens they already emitted (lastCoreExpertIdNum).
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::AssistInit()
{
    // int8 rows are zero-filled via int16 lanes (half the element count,
    // same bytes) because Duplicate is used on a wider type here.
    if constexpr (IsSameType<T, int8_t>::value) {
        LocalTensor<int16_t> outLocal = copyOutZeroQueue.AllocTensor<int16_t>();
        Duplicate<int16_t>(outLocal, static_cast<int16_t>(0), this->perLoopCols);
        copyOutZeroQueue.EnQue<int16_t>(outLocal);
    } else {
        LocalTensor<T> outLocal = copyOutZeroQueue.AllocTensor<T>();
        Duplicate<T>(outLocal, static_cast<T>(0), this->perLoopCols);
        copyOutZeroQueue.EnQue<T>(outLocal);
    }
    if (this->needScaleCopy) {
        LocalTensor<float> scaleOutLocal = scaleOutZeroQueue.AllocTensor<float>();
        Duplicate<float>(scaleOutLocal, 0.0f, FP32_ONE_BLOCK_NUM);
        scaleOutZeroQueue.EnQue<float>(scaleOutLocal);
    }

    if (this->blockIdx != 0) {
        // expertIdxValueGm holds one (expert id, count) pair per core.
        this->lastCoreExpertId = expertIdxValueGm.GetValue((this->blockIdx - 1) * 2);
        this->lastCoreExpertIdNum = expertIdxValueGm.GetValue((this->blockIdx - 1) * 2 + 1);
        // Walk earlier cores backwards, accumulating counts while they ended
        // on the same expert; stop at the first core that ended on an
        // earlier expert.
        for (int64_t i = this->blockIdx - 2; i >= 0; i--) {
            int32_t lastExpertIdx = expertIdxValueGm.GetValue(i * 2);
            if (lastExpertIdx < this->lastCoreExpertId) {
                break;
            }
            int32_t lastExpertNum = expertIdxValueGm.GetValue(i * 2 + 1);
            this->lastCoreExpertIdNum += lastExpertNum;
        }
    }
}
|
||||
|
||||
// Load this loop's slice of the dst->src row mapping and the matching sorted
// expert indices into one UB tensor: rows at [0, length), expert indices at
// [length, 2*length).
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::CopyIn(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopy(inLocal, expandDstToSrcRowGm[progress * perLoopRows], length);
    DataCopy(inLocal[length], expandedExpertIdxGm[progress * perLoopRows], length);
    copyInQueue.EnQue<int32_t>(inLocal);
}
|
||||
|
||||
// Per-row scatter with capacity enforcement. Walking the sorted rows:
//  - whenever the expert id advances past lastExpertId, first zero-pad the
//    unfilled capacity slots of every skipped expert (token rows and,
//    optionally, scales);
//  - if the current expert still has capacity, record this row's capacity
//    slot index into expandedRowIdxGm at the row's source position;
//  - rows arriving after an expert is full fall through without a write,
//    i.e. the token is dropped (its expandedRowIdx entry is presumably
//    pre-initialized elsewhere — TODO confirm).
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::CopyOut(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    DataCopyExtParams ScaleParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};

    // Indices are read scalar-side below; wait for the CopyIn DMA.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    if (this->lastExpertId == -1) {
        // First loop on this core: resume from the carry-in computed in AssistInit.
        this->lastExpertId = this->lastCoreExpertId;
        this->tokenCount = this->lastCoreExpertIdNum;
    }
    for (int64_t idx = 0; idx < currentLoopRows; idx++) {
        // Expert index for this row lives in the second half of inLocal.
        int32_t expertIdx = inLocal[length].GetValue(idx);
        int32_t index = 0;
        while (this->lastExpertId < expertIdx) {
            // Zero-pad the rest of the previous expert's capacity region.
            while (this->tokenCount < this->expertCapacity) {
                index = this->lastExpertId * this->expertCapacity + this->tokenCount;
                if (this->needScaleCopy) {
                    DataCopyPad(expandedScaleGm[index], this->scaleLocal, ScaleParams);
                }
                int64_t col = this->perLoopCols;
                for (int64_t i = 0; i < this->colLoops; i++) {
                    if (i == this->colLoops - 1) {
                        col = this->lastLoopCols;
                    }
                    DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(T)), 0,
                                                  0, 0};
                    DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal,
                                copyParams1);
                }
                this->tokenCount++;
            }
            this->tokenCount = 0;
            this->lastExpertId++;
        }

        if (this->tokenCount < this->expertCapacity) {
            // Expert still has room: map source row -> its capacity slot.
            int32_t outOffset = inLocal.GetValue(idx);
            index = expertIdx * this->expertCapacity + this->tokenCount;
            outLocal.SetValue(0, index);
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm[outOffset], outLocal, copyParams);
            this->tokenCount++;
        }
    }
    copyInQueue.FreeTensor(inLocal);
    copyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// After the row loop, the last participating core zero-pads all remaining
// capacity slots from lastExpertId up to expertNum. Other cores only release
// their padding tensors. Always frees outTmpLocal/scaleLocal on every path.
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::CopyOutRemain()
{
    if (this->blockIdx != this->srcToDstTilingData->needCoreNum - 1) {
        copyOutZeroQueue.FreeTensor(this->outTmpLocal);
        if (this->needScaleCopy) {
            scaleOutZeroQueue.FreeTensor(this->scaleLocal);
        }
        return;
    }
    DataCopyExtParams ScaleParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
    while (this->lastExpertId < this->expertNum) {
        while (this->tokenCount < this->expertCapacity) {
            int32_t index = this->lastExpertId * this->expertCapacity + this->tokenCount;
            if (this->needScaleCopy) {
                DataCopyPad(expandedScaleGm[index], this->scaleLocal, ScaleParams);
            }
            int64_t col = this->perLoopCols;
            for (int64_t i = 0; i < this->colLoops; i++) {
                if (i == this->colLoops - 1) {
                    col = this->lastLoopCols;
                }
                DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(T)), 0, 0, 0};
                DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal, copyParams);
                // Order the DMA before the next scalar-side iteration.
                SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
            }
            this->tokenCount++;
        }
        this->tokenCount = 0;
        this->lastExpertId++;
    }
    copyOutZeroQueue.FreeTensor(this->outTmpLocal);
    if (this->needScaleCopy) {
        scaleOutZeroQueue.FreeTensor(this->scaleLocal);
    }
}
|
||||
|
||||
// Inter-core barrier; a no-op on single-core launches and under the CPU
// kernel-test build (__CCE_KT_TEST__), where hardware sync is unavailable.
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::SyncAll()
{
    if (coreNum == 1) {
        return;
    }
#ifndef __CCE_KT_TEST__
    AscendC::SyncAll();
#endif
}
|
||||
|
||||
// Configure tiling, GM buffers, and UB queues for the drop/pad pass.
// Workspace layout used here (int32 units, each region aligned):
//   [0, n*k)          : sorted expert indices
//   [n*k, 2*n*k)      : dst->src row mapping
//   [2*n*k + 2*expertNum, ..) : per-core (expert id, count) pairs
// (TODO confirm layout against the workspace producer, not visible here.)
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::Init(GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                                                          GM_ADDR expandedScale, GM_ADDR workspace,
                                                                          const TilingData *tilingData,
                                                                          TPipe *tPipe)
{
    int64_t blockNum = GetBlockNum();
    this->pipe = tPipe;
    this->blockIdx = GetBlockIdx();

    this->coreNum = tilingData->coreNum;
    this->totalLength = tilingData->n * tilingData->k;
    this->srcToDstTilingData = &(tilingData->srcToDstDropPadParamsOp);
    this->expertNum = tilingData->expertNum;
    this->expertCapacity = tilingData->expertCapacity;
    this->cols = tilingData->cols;
    this->isInputScale_ = tilingData->isInputScale;
    this->quantMode_ = tilingData->quantMode;

    // The last participating core carries the remainder rows/loops.
    if (this->blockIdx == this->srcToDstTilingData->needCoreNum - 1) {
        this->coreRows = this->srcToDstTilingData->lastCoreRows;
        this->perLoopRows = this->srcToDstTilingData->lastCorePerLoopRows;
        this->lastLoopRows = this->srcToDstTilingData->lastCoreLastLoopRows;
        this->rowLoops = this->srcToDstTilingData->lastCoreLoops;
    } else {
        this->coreRows = this->srcToDstTilingData->perCoreRows;
        this->perLoopRows = this->srcToDstTilingData->perCorePerLoopRows;
        this->lastLoopRows = this->srcToDstTilingData->perCoreLastLoopRows;
        this->rowLoops = this->srcToDstTilingData->perCoreLoops;
    }
    this->perLoopCols = this->srcToDstTilingData->perLoopCols;
    this->lastLoopCols = this->srcToDstTilingData->lastLoopCols;
    this->colLoops = this->srcToDstTilingData->colLoops;
    // Scale padding only applies when scales are supplied and no quant mode is set.
    this->needScaleCopy = (this->isInputScale_ != 0 && this->quantMode_ == -1);

    expandedScaleGm.SetGlobalBuffer((__gm__ float *)expandedScale);

    int64_t length = Align(this->totalLength, sizeof(int32_t));
    expandedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, length);
    expandedXGm.SetGlobalBuffer((__gm__ T *)expandedX, this->expertNum * this->expertCapacity * this->cols);

    expandedExpertIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                            this->blockIdx * this->srcToDstTilingData->perCoreRows,
                                        Align(this->coreRows, sizeof(int32_t)));
    expandDstToSrcRowGm.SetGlobalBuffer((__gm__ int32_t *)workspace + length +
                                            this->blockIdx * this->srcToDstTilingData->perCoreRows,
                                        Align(this->coreRows, sizeof(int32_t)));
    expertIdxValueGm.SetGlobalBuffer(
        (__gm__ int32_t *)workspace + length * 2 + Align(this->expertNum, sizeof(int32_t)) * 2, this->coreNum * 2);

    // Double-width in-queue: rows and expert indices share one tensor (see CopyIn).
    pipe->InitBuffer(copyInQueue, 1, AlignBytes(this->perLoopRows, sizeof(int32_t)) * 2);
    pipe->InitBuffer(copyOutQueue, 1, AlignBytes(INT32_ONE_BLOCK_NUM, sizeof(int32_t)));
    if constexpr (IsSameType<T, int8_t>::value) {
        pipe->InitBuffer(copyOutZeroQueue, 1, AlignBytes(this->perLoopCols, sizeof(int16_t)));
    } else {
        pipe->InitBuffer(copyOutZeroQueue, 1, AlignBytes(this->perLoopCols, sizeof(T)));
    }
    if (this->needScaleCopy) {
        pipe->InitBuffer(scaleOutZeroQueue, 1, BLOCK_BYTES);
    }
}
|
||||
|
||||
template <typename T, typename TilingData>
// Main entry for this stage: rows assigned to this core are processed in
// rowLoops chunks of perLoopRows rows (the final chunk uses lastLoopRows),
// then any remaining expert slots are padded via CopyOutRemain().
// Cores beyond needCoreNum do no work but still join the final SyncAll()
// so every core reaches the cross-core barrier.
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::Process()
{
    if (this->blockIdx < this->srcToDstTilingData->needCoreNum) {
        // Prepares the zero-fill row/scale tensors and per-core expert bookkeeping.
        AssistInit();
        // DeQue hands ownership of the zero-fill tensors to this stage; they
        // are released later (inside CopyOutRemain, per the sibling class).
        this->outTmpLocal = copyOutZeroQueue.DeQue<T>();
        if (this->needScaleCopy) {
            this->scaleLocal = scaleOutZeroQueue.DeQue<float>();
        }
        currentLoopRows = perLoopRows;
        for (int64_t loop = 0; loop < this->rowLoops; loop++) {
            if (loop == this->rowLoops - 1) {
                // Final iteration covers the remainder rows.
                currentLoopRows = lastLoopRows;
            }
            CopyIn(loop);
            CopyOut(loop);
        }
        // Zero-pad the tail expert slots (only the last active core does real work here).
        CopyOutRemain();
    }
    // All cores, active or not, must hit the barrier.
    this->SyncAll();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_H
|
||||
// ===== begin file: moe_custom_row_idx_gather_droppad_dynamic.h =====
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_row_idx_gather_droppad_dynamic.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_DYNAMIC_H
|
||||
#define MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_DYNAMIC_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
template <typename T, typename TilingData>
// Kernel stage that scatters tokens into per-expert capacity slots
// (drop/pad semantics) while applying dynamic int8 quantization:
// each emitted row is scaled by its abs-max (optionally after a smooth
// scale) and written to expandedX as int8, with the per-row scale
// written to dynamicQuantScale.
class MoeCustomSrcToDstAndGather {
public:
    __aicore__ inline MoeCustomSrcToDstAndGather(){};
    // Binds GM buffers and unpacks tiling; must be called before Process().
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR scale, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR dynamicQuantScale, GM_ADDR workspace, const TilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Loads the sorted dst->src row map and expert ids for one row chunk.
    __aicore__ inline void CopyIn(int64_t progress);
    // Single-pass path: whole row fits in UB (colLoops == 1).
    __aicore__ inline void CopyOut(int64_t progress);
    // Tiled path: row is split across colLoops column tiles (colLoops > 1).
    __aicore__ inline void CopyOutLoops(int64_t progress);
    // Quantizes one full row and writes expandedX / dynamicQuantScale.
    __aicore__ inline void Compute(int32_t srcIdx, int32_t dstIdx, int32_t expertIdx);
    // Computes the abs-max of one column tile; stages the (smoothed) fp32
    // tile into quantSrcGm for the second pass. Returns the tile max.
    __aicore__ inline float ComputeMax(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                       LocalTensor<float> &dynamicQuantLocal, int32_t srcIdx, int32_t expertIdx,
                                       int64_t j);
    // Second pass of the tiled path: divides a staged tile by scaleTemp and
    // casts it to int8.
    __aicore__ inline void ComputeScale(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal, float scaleTemp,
                                        int64_t dstIndex, int64_t j);
    // Two-pass (max, then scale) quantization of one row across column tiles.
    __aicore__ inline void ComputeLoops(int32_t srcIdx, int32_t dstIdx, int32_t expertIdx);

    // Zero-pads the expert slots left unfilled after the last token.
    __aicore__ inline void CopyOutRemain();
    __aicore__ inline void SyncAll();
    // Prepares zero-fill tensors and per-core expert carry-over state.
    __aicore__ inline void AssistInit();

private:
    TPipe *pipe;
    TQue<QuePosition::VECIN, 1> copyInQueue;     // dst->src map + expert ids
    TQue<QuePosition::VECOUT, 1> copyOutQueue;   // single expandedRowIdx value
    TQue<QuePosition::VECOUT, 1> copyOutZeroQueue;  // zero row used for padding

    TQue<QuePosition::VECIN, 1> inputXInQueue;   // one row / tile of x
    TQue<QuePosition::VECIN, 1> smoothInQueue;   // smooth-scale row
    TQue<QuePosition::VECOUT, 1> calcQueue;      // fp32 scratch
    TQue<QuePosition::VECOUT, 1> inputXOutQueue; // quantized int8 output
    TQue<QuePosition::VECOUT, 1> scaleOutQueue;  // per-row quant scale
    TQue<QuePosition::VECOUT, 1> scaleOutZeroQueue;  // zero scale for padding

    GlobalTensor<int32_t> expandDstToSrcRowGm;   // workspace: sorted dst->src map
    GlobalTensor<int32_t> expandedRowIdxGm;      // output: src row -> dst slot
    GlobalTensor<int32_t> expertIdxValueGm;      // workspace: per-core (expert, count) pairs
    GlobalTensor<int32_t> expandedExpertIdxGm;   // workspace: sorted expert ids
    GlobalTensor<int8_t> expandedXGm;            // output: quantized gathered rows

    GlobalTensor<T> inputXGm;                    // input tokens
    GlobalTensor<float> quantSmoothGm;           // optional smooth scales
    GlobalTensor<float> dynamicQuantScaleGm;     // output: per-row scales
    GlobalTensor<float> quantSrcGm;              // workspace: staged fp32 tiles (colLoops > 1)

    LocalTensor<int8_t> outTmpLocal;             // zero int8 row for padding
    LocalTensor<float> scaleOutTmpLocal;         // zero scale for padding
    LocalTensor<float> smoothLocal;              // cached smooth row (SCALE_1H path)

    const MoeCustomSrcToDstCapacityComputeTilingData *srcToDstTilingData;

    int64_t coreNum;
    int64_t blockIdx;
    int64_t totalLength;     // n * k (total expanded rows)
    int64_t currentLoopRows; // rows handled in the current chunk
    int64_t coreRows;        // rows assigned to this core
    int64_t perLoopRows;
    int64_t lastLoopRows;
    int64_t rowLoops;
    int64_t expertCapacity;  // max tokens kept per expert (drop/pad)
    int64_t expertNum;
    int64_t cols;
    int64_t perLoopCols;
    int64_t lastLoopCols;
    int64_t colLoops;
    int64_t perLoopColsAlign;
    int64_t k;               // experts per token
    int64_t colsTileLength;  // current column-tile length
    int64_t smoothType;      // NO_SCALE / SCALE_1H / SCALE_EH

    // Scatter progress carried across chunks on this core.
    int64_t tokenCount = 0;          // tokens emitted for lastExpertId so far
    int32_t lastExpertId = -1;       // -1 until seeded from lastCoreExpertId
    int32_t lastCoreExpertId = 0;    // last expert handled by preceding cores
    int32_t lastCoreExpertIdNum = 0; // tokens already emitted for that expert
};
|
||||
|
||||
template <typename T, typename TilingData>
// Builds the zero-fill tensors used for padding unfilled expert slots and,
// for non-first cores, reconstructs how many tokens earlier cores already
// emitted for the expert this core starts on.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::AssistInit()
{
    // Zero row used to pad expandedX (allocated as int16 halves of the int8 buffer).
    LocalTensor<int16_t> outLocal = copyOutZeroQueue.AllocTensor<int16_t>();
    Duplicate<int16_t>(outLocal, static_cast<int16_t>(0), this->perLoopCols);
    copyOutZeroQueue.EnQue<int16_t>(outLocal);
    // Zero scale (one fp32 block) used to pad dynamicQuantScale.
    LocalTensor<float> scaleOutLocal = scaleOutZeroQueue.AllocTensor<float>();
    Duplicate<float>(scaleOutLocal, 0.0f, 8);
    scaleOutZeroQueue.EnQue<float>(scaleOutLocal);

    if (this->blockIdx != 0) {
        // Each preceding core recorded (last expert id, token count) in
        // expertIdxValueGm. Start from the immediately preceding core...
        this->lastCoreExpertId = expertIdxValueGm.GetValue((this->blockIdx - 1) * EXPERT_ID_VALUE_NUM);
        this->lastCoreExpertIdNum = expertIdxValueGm.GetValue((this->blockIdx - 1) * EXPERT_ID_VALUE_NUM + 1);
        // ...then walk backwards accumulating counts from every earlier core
        // that ended on the same expert (the sorted order guarantees earlier
        // cores have expert ids <= lastCoreExpertId; stop at the first strictly
        // smaller one).
        for (int64_t i = this->blockIdx - 2; i >= 0; i--) {
            int32_t lastExpertIdx = expertIdxValueGm.GetValue(i * EXPERT_ID_VALUE_NUM);
            if (lastExpertIdx < this->lastCoreExpertId) {
                break;
            }
            int32_t lastExpertNum = expertIdxValueGm.GetValue(i * EXPERT_ID_VALUE_NUM + 1);
            this->lastCoreExpertIdNum += lastExpertNum;
        }
    }
}
|
||||
|
||||
template <typename T, typename TilingData>
// Loads one chunk of the sorted metadata for this core:
//   [0, length)          dst->src row indices (expandDstToSrcRowGm)
//   [length, 2*length)   sorted expert ids     (expandedExpertIdxGm)
// length is currentLoopRows rounded up to an int32 block boundary.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyIn(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopy(inLocal, expandDstToSrcRowGm[progress * perLoopRows], length);
    DataCopy(inLocal[length], expandedExpertIdxGm[progress * perLoopRows], length);

    copyInQueue.EnQue<int32_t>(inLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Quantizes one full token row to int8 and writes it to its expert slot.
//   srcIdx    expanded source index; srcIdx / k selects the token row in x
//   dstIdx    destination slot (expert * expertCapacity + position)
//   expertIdx expert id, used to pick the per-expert smooth row (SCALE_EH)
// Pipeline: load row (+ optional smooth) -> optional smooth multiply ->
// abs-max -> divide by max/MAX_INT8 -> round through int32/half to int8.
// The EnQue/DeQue pairs and PipeBarrier calls order the MTE2/V/MTE3
// pipelines; do not reorder statements here.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::Compute(int32_t srcIdx, int32_t dstIdx, int32_t expertIdx)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(this->cols * sizeof(T)), 0, 0, 0};
    DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(this->cols * sizeof(int8_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    LocalTensor<float> inLocal = inputXInQueue.AllocTensor<float>();

    if constexpr (IsSameType<T, float>::value) {
        DataCopyPad(inLocal, inputXGm[srcIdx / this->k * this->cols], copyInParams, {false, 0, 0, 0});
    } else {
        // Narrow T (half/bf16): land the raw row in the upper part of the fp32
        // buffer (offset perLoopColsAlign) so the in-place Cast below can
        // widen it into the front without overlap.
        DataCopyPad(inLocal.template ReinterpretCast<T>()[perLoopColsAlign], inputXGm[srcIdx / this->k * this->cols],
                    copyInParams, {false, 0, 0, 0});
    }

    if (smoothType == SCALE_EH) {
        // Per-expert smooth row; for SCALE_1H smoothLocal was preloaded once
        // in Process().
        DataCopyPad(smoothLocal, quantSmoothGm[expertIdx * this->cols], smoothParams, {false, 0, 0, 0});
    }

    inputXInQueue.EnQue<float>(inLocal);
    smoothInQueue.EnQue(smoothLocal);
    smoothLocal = smoothInQueue.DeQue<float>();

    inLocal = inputXInQueue.DeQue<float>();

    LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
    LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
    LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();

    if constexpr (!IsSameType<T, float>::value) {
        // Widen T -> fp32 in place (reads the staged upper half).
        Cast(inLocal, inLocal.template ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
        PipeBarrier<PIPE_V>();
    }

    if (smoothType != NO_SCALE) {
        Mul(inLocal, inLocal, smoothLocal, this->cols);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, this->cols);
    PipeBarrier<PIPE_V>();

    ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
    PipeBarrier<PIPE_V>();

    // Symmetric quant scale: abs-max mapped to int8 full range.
    float maxValue = dynamicQuantLocal.GetValue(0) / MAX_INT8;

    Duplicate<float>(dynamicQuantLocal, maxValue, FP32_ONE_BLOCK_NUM);
    Duplicate<float>(tempLocal, maxValue, this->cols);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, this->cols);
    PipeBarrier<PIPE_V>();

    // Round fp32 -> int32 -> half -> int8 (hardware cast chain; half step
    // requires a unity dequant scale).
    Cast(tempLocal.ReinterpretCast<int32_t>(), tempLocal, RoundMode::CAST_RINT, this->cols);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND, this->cols);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, this->cols);

    calcQueue.FreeTensor(tempLocal);
    inputXOutQueue.EnQue(outLocal);
    scaleOutQueue.EnQue(dynamicQuantLocal);

    LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
    DataCopyPad(dynamicQuantScaleGm[dstIdx], quantScaleLocal, quantScaleParams);

    outLocal = inputXOutQueue.DeQue<int8_t>();
    DataCopyPad(expandedXGm[dstIdx * this->cols], outLocal, copyOutParams);

    inputXInQueue.FreeTensor(inLocal);
    inputXOutQueue.FreeTensor(outLocal);
    scaleOutQueue.FreeTensor(quantScaleLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Single-pass output path (colLoops == 1): walks the sorted expert ids of one
// row chunk, zero-pads every expert that finished before the current one up
// to expertCapacity, and for each kept token records its slot in
// expandedRowIdx and quantizes the row via Compute(). Tokens beyond an
// expert's capacity are dropped (the inner `if` simply skips them).
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyOut(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(this->cols * sizeof(int8_t)), 0, 0,
                                  0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    // Scalar reads below must wait for the MTE2 copy of inLocal.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    if (this->lastExpertId == -1) {
        // First chunk on this core: resume from where earlier cores stopped.
        this->lastExpertId = this->lastCoreExpertId;
        this->tokenCount = this->lastCoreExpertIdNum;
    }
    for (int64_t idx = 0; idx < currentLoopRows; idx++) {
        // Expert ids live in the second half of inLocal (see CopyIn).
        int32_t expertIdx = inLocal[length].GetValue(idx);
        int32_t index = 0;
        // Zero-pad all remaining slots of experts that are now complete.
        while (this->lastExpertId < expertIdx) {
            while (this->tokenCount < this->expertCapacity) {
                index = this->lastExpertId * this->expertCapacity + this->tokenCount;
                DataCopyPad(expandedXGm[index * this->cols], this->outTmpLocal, copyParams1);
                DataCopyPad(dynamicQuantScaleGm[index], this->scaleOutTmpLocal, quantScaleParams);
                this->tokenCount++;
            }
            this->tokenCount = 0;
            this->lastExpertId++;
        }

        // Keep the token only while the expert still has capacity (drop otherwise).
        if (this->tokenCount < this->expertCapacity) {
            int32_t outOffset = inLocal.GetValue(idx);  // original source row
            index = expertIdx * this->expertCapacity + this->tokenCount;
            outLocal.SetValue(0, index);
            // Scalar SetValue must complete before MTE3 reads outLocal.
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm[outOffset], outLocal, copyParams);
            Compute(outOffset, index, expertIdx);
            this->tokenCount++;
        }
    }
    copyInQueue.FreeTensor(inLocal);
    copyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// First pass of the tiled quantization: loads column tile j of row srcIdx,
// optionally applies the smooth scale, computes the tile's abs-max, and
// stages the (smoothed) fp32 tile into quantSrcGm so the second pass
// (ComputeScale) does not reload/re-smooth it. Returns the tile max.
// The result is parked at dynamicQuantLocal[FP32_ONE_BLOCK_NUM] to keep
// slot 0 free for the final scale.
__aicore__ inline float MoeCustomSrcToDstAndGather<T, TilingData>::ComputeMax(LocalTensor<float> &inLocal,
                                                                              LocalTensor<float> &tempLocal,
                                                                              LocalTensor<float> &dynamicQuantLocal,
                                                                              int32_t srcIdx, int32_t expertIdx, int64_t j)
{
    // Local smooth tile; shadows the member smoothLocal on purpose here.
    LocalTensor<float> smoothLocal = smoothInQueue.AllocTensor<float>();

    DataCopyExtParams intriParamsT{1, static_cast<uint32_t>(colsTileLength * sizeof(T)), 0, 0, 0};
    DataCopyExtParams intriParamsFp32{1, static_cast<uint32_t>(colsTileLength * sizeof(float)), 0, 0, 0};

    if constexpr (!IsSameType<T, float>::value) {
        // Narrow T: stage raw data in the upper half of the fp32 buffer so the
        // widening Cast below can write to the front without overlap.
        DataCopyPad(inLocal.ReinterpretCast<T>()[perLoopColsAlign],
                    inputXGm[srcIdx * this->cols + j * this->perLoopCols], intriParamsT, {false, 0, 0, 0});
    } else {
        DataCopyPad(inLocal, inputXGm[srcIdx * this->cols + j * this->perLoopCols], intriParamsT, {false, 0, 0, 0});
    }

    inputXInQueue.EnQue<float>(inLocal);
    inLocal = inputXInQueue.DeQue<float>();

    if constexpr (!IsSameType<T, float>::value) {
        Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
        PipeBarrier<PIPE_V>();
    }

    if (smoothType != NO_SCALE) {
        DataCopyPad(smoothLocal, quantSmoothGm[expertIdx * this->cols + j * this->perLoopCols], intriParamsFp32,
                    {false, 0, 0, 0});
        smoothInQueue.EnQue(smoothLocal);
        smoothLocal = smoothInQueue.DeQue<float>();

        Mul(inLocal, inLocal, smoothLocal, colsTileLength);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, colsTileLength);
    PipeBarrier<PIPE_V>();

    ReduceMax(dynamicQuantLocal[FP32_ONE_BLOCK_NUM], tempLocal, tempLocal, colsTileLength);

    // Stage the smoothed fp32 tile for the second pass.
    DataCopyPad(quantSrcGm[j * this->perLoopCols], inLocal, intriParamsFp32);
    smoothInQueue.FreeTensor(smoothLocal);
    // Next tile's MTE2 load must wait for this MTE3 store (quantSrcGm reuse).
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);

    return dynamicQuantLocal.GetValue(FP32_ONE_BLOCK_NUM);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Second pass of the tiled quantization: reloads the staged fp32 tile j from
// quantSrcGm, divides by the row-wide scale scaleTemp, rounds to int8 through
// the int32/half cast chain, and writes the tile to expandedX at dstIndex.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::ComputeScale(LocalTensor<float> &inLocal,
                                                                               LocalTensor<float> &tempLocal,
                                                                               float scaleTemp, int64_t dstIndex,
                                                                               int64_t j)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(colsTileLength * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(colsTileLength * sizeof(int8_t)), 0, 0, 0};

    LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();

    DataCopyPad(inLocal, quantSrcGm[j * this->perLoopCols], copyInParams, {false, 0, 0, 0});
    inputXInQueue.EnQue<float>(inLocal);
    inLocal = inputXInQueue.DeQue<float>();

    // Broadcast the row scale and divide element-wise.
    Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, colsTileLength);
    PipeBarrier<PIPE_V>();

    // fp32 -> int32 -> half -> int8 rounding chain (unity dequant scale).
    Cast(tempLocal.ReinterpretCast<int32_t>(), tempLocal, RoundMode::CAST_RINT, colsTileLength);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
         colsTileLength);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, colsTileLength);

    inputXOutQueue.EnQue(outLocal);
    outLocal = inputXOutQueue.DeQue<int8_t>();
    DataCopyPad(expandedXGm[dstIndex * this->cols + j * this->perLoopCols], outLocal, copyOutParams);

    inputXOutQueue.FreeTensor(outLocal);
    // Next tile's load must wait for this store to drain.
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
}
|
||||
|
||||
template <typename T, typename TilingData>
|
||||
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::ComputeLoops(int32_t srcIdx, int32_t dstIdx,
|
||||
int32_t expertIdx)
|
||||
{
|
||||
LocalTensor<float> inLocal = inputXInQueue.AllocTensor<float>();
|
||||
LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
|
||||
LocalTensor<float> quantScaleLocal = scaleOutQueue.AllocTensor<float>();
|
||||
DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
|
||||
uint32_t tmp = 0xFF7FFFFF;
|
||||
float reduceMax = *((float *)&tmp);
|
||||
for (int64_t j = 0; j < this->colLoops; j++) {
|
||||
colsTileLength = this->perLoopCols;
|
||||
if (j == this->colLoops - 1) {
|
||||
colsTileLength = this->lastLoopCols;
|
||||
}
|
||||
float tileMax = ComputeMax(inLocal, tempLocal, quantScaleLocal, srcIdx / this->k, expertIdx, j);
|
||||
reduceMax = (reduceMax > tileMax) ? reduceMax : tileMax;
|
||||
}
|
||||
|
||||
float scaleTemp = reduceMax / 127.0f;
|
||||
Duplicate<float>(quantScaleLocal, scaleTemp, 8);
|
||||
scaleOutQueue.EnQue(quantScaleLocal);
|
||||
quantScaleLocal = scaleOutQueue.DeQue<float>();
|
||||
|
||||
DataCopyPad(dynamicQuantScaleGm[dstIdx], quantScaleLocal, quantScaleParams);
|
||||
|
||||
for (int64_t j = 0; j < this->colLoops; j++) {
|
||||
colsTileLength = this->perLoopCols;
|
||||
if (j == this->colLoops - 1) {
|
||||
colsTileLength = this->lastLoopCols;
|
||||
}
|
||||
ComputeScale(inLocal, tempLocal, scaleTemp, dstIdx, j);
|
||||
}
|
||||
|
||||
inputXInQueue.FreeTensor(inLocal);
|
||||
calcQueue.FreeTensor(tempLocal);
|
||||
scaleOutQueue.FreeTensor(quantScaleLocal);
|
||||
}
|
||||
|
||||
template <typename T, typename TilingData>
// Tiled output path (colLoops > 1): same drop/pad walk as CopyOut(), but
// padding and quantization are performed tile-by-tile across the row.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyOutLoops(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    // Scalar reads below must wait for the MTE2 copy of inLocal.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    if (this->lastExpertId == -1) {
        // First chunk on this core: resume from where earlier cores stopped.
        this->lastExpertId = this->lastCoreExpertId;
        this->tokenCount = this->lastCoreExpertIdNum;
    }
    for (int64_t idx = 0; idx < currentLoopRows; idx++) {
        int32_t expertIdx = inLocal[length].GetValue(idx);
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        int32_t index = 0;
        // Zero-pad all remaining slots of experts that are now complete,
        // one column tile at a time (outTmpLocal holds only perLoopCols).
        while (this->lastExpertId < expertIdx) {
            while (this->tokenCount < this->expertCapacity) {
                index = this->lastExpertId * this->expertCapacity + this->tokenCount;
                int64_t col = this->perLoopCols;
                DataCopyPad(dynamicQuantScaleGm[index], this->scaleOutTmpLocal, quantScaleParams);
                for (int64_t i = 0; i < this->colLoops; i++) {
                    if (i == this->colLoops - 1) {
                        col = this->lastLoopCols;
                    }
                    DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(int8_t)),
                                                  0, 0, 0};
                    DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal,
                                copyParams1);
                }
                this->tokenCount++;
            }
            this->tokenCount = 0;
            this->lastExpertId++;
        }

        // Keep the token only while the expert still has capacity.
        if (this->tokenCount < this->expertCapacity) {
            int32_t outOffset = inLocal.GetValue(idx);  // original source row
            index = expertIdx * this->expertCapacity + this->tokenCount;
            outLocal.SetValue(0, index);
            // Scalar SetValue must complete before MTE3 reads outLocal.
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm[outOffset], outLocal, copyParams);
            if (smoothType == SCALE_EH) {
                ComputeLoops(outOffset, index, expertIdx);
            } else {
                // No per-expert smooth row: expert 0's offset is a don't-care.
                ComputeLoops(outOffset, index, 0);
            }
            SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
            this->tokenCount++;
        }
    }
    copyInQueue.FreeTensor(inLocal);
    copyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Zero-pads every expert slot that remains unfilled after the last real
// token. Only the last active core performs the padding; all cores release
// the zero-fill tensors here (matching the DeQue ownership taken in Process).
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyOutRemain()
{
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    if (this->blockIdx != this->srcToDstTilingData->needCoreNum - 1) {
        // Not the last active core: just release the zero-fill tensors.
        copyOutZeroQueue.FreeTensor(this->outTmpLocal);
        scaleOutZeroQueue.FreeTensor(this->scaleOutTmpLocal);
        return;
    }
    // Pad out the current expert's remaining slots and every expert after it.
    while (this->lastExpertId < this->expertNum) {
        while (this->tokenCount < this->expertCapacity) {
            int32_t index = this->lastExpertId * this->expertCapacity + this->tokenCount;
            int64_t col = this->perLoopCols;
            DataCopyPad(dynamicQuantScaleGm[index], this->scaleOutTmpLocal, quantScaleParams);
            for (int64_t i = 0; i < this->colLoops; i++) {
                if (i == this->colLoops - 1) {
                    col = this->lastLoopCols;
                }
                DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(int8_t)), 0,
                                             0, 0};
                DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal, copyParams);
                SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
            }
            this->tokenCount++;
        }
        this->tokenCount = 0;
        this->lastExpertId++;
    }
    copyOutZeroQueue.FreeTensor(this->outTmpLocal);
    scaleOutZeroQueue.FreeTensor(this->scaleOutTmpLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
|
||||
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::Init(GM_ADDR x, GM_ADDR scale, GM_ADDR expandedRowIdx,
|
||||
GM_ADDR expandedX, GM_ADDR dynamicQuantScale,
|
||||
GM_ADDR workspace, const TilingData *tilingData,
|
||||
TPipe *tPipe)
|
||||
{
|
||||
int64_t blockNum = GetBlockNum();
|
||||
this->pipe = tPipe;
|
||||
this->blockIdx = GetBlockIdx();
|
||||
|
||||
this->coreNum = tilingData->coreNum;
|
||||
this->totalLength = tilingData->n * tilingData->k;
|
||||
this->srcToDstTilingData = &(tilingData->srcToDstDropPadDynamicParamsOp);
|
||||
this->expertNum = tilingData->expertNum;
|
||||
this->expertCapacity = tilingData->expertCapacity;
|
||||
this->cols = tilingData->cols;
|
||||
this->k = tilingData->k;
|
||||
this->smoothType = tilingData->smoothType;
|
||||
|
||||
if (this->blockIdx == this->srcToDstTilingData->needCoreNum - 1) {
|
||||
this->coreRows = this->srcToDstTilingData->lastCoreRows;
|
||||
this->perLoopRows = this->srcToDstTilingData->lastCorePerLoopRows;
|
||||
this->lastLoopRows = this->srcToDstTilingData->lastCoreLastLoopRows;
|
||||
this->rowLoops = this->srcToDstTilingData->lastCoreLoops;
|
||||
} else {
|
||||
this->coreRows = this->srcToDstTilingData->perCoreRows;
|
||||
this->perLoopRows = this->srcToDstTilingData->perCorePerLoopRows;
|
||||
this->lastLoopRows = this->srcToDstTilingData->perCoreLastLoopRows;
|
||||
this->rowLoops = this->srcToDstTilingData->perCoreLoops;
|
||||
}
|
||||
this->perLoopCols = this->srcToDstTilingData->perLoopCols;
|
||||
this->lastLoopCols = this->srcToDstTilingData->lastLoopCols;
|
||||
this->colLoops = this->srcToDstTilingData->colLoops;
|
||||
this->perLoopColsAlign = Align(this->perLoopCols, sizeof(T));
|
||||
|
||||
inputXGm.SetGlobalBuffer((__gm__ T *)x);
|
||||
quantSmoothGm.SetGlobalBuffer((__gm__ float *)scale);
|
||||
dynamicQuantScaleGm.SetGlobalBuffer((__gm__ float *)dynamicQuantScale);
|
||||
|
||||
int64_t length = Align(this->totalLength, sizeof(int32_t));
|
||||
expandedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, length);
|
||||
expandedXGm.SetGlobalBuffer((__gm__ int8_t *)expandedX, this->expertNum * this->expertCapacity * this->cols);
|
||||
|
||||
expandedExpertIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
|
||||
this->blockIdx * this->srcToDstTilingData->perCoreRows,
|
||||
Align(this->coreRows, sizeof(int32_t)));
|
||||
expandDstToSrcRowGm.SetGlobalBuffer((__gm__ int32_t *)workspace + length +
|
||||
this->blockIdx * this->srcToDstTilingData->perCoreRows,
|
||||
Align(this->coreRows, sizeof(int32_t)));
|
||||
expertIdxValueGm.SetGlobalBuffer(
|
||||
(__gm__ int32_t *)workspace + length * 2 + Align(this->expertNum, sizeof(int32_t)) * 2, this->coreNum * 2);
|
||||
if (this->colLoops > 1) {
|
||||
quantSrcGm.SetGlobalBuffer((__gm__ float *)workspace + length * 2 +
|
||||
Align(this->expertNum, sizeof(int32_t)) * 2 + this->coreNum * 2 +
|
||||
this->blockIdx * this->cols,
|
||||
this->cols * sizeof(float));
|
||||
}
|
||||
|
||||
pipe->InitBuffer(copyInQueue, 1, AlignBytes(this->perLoopRows, sizeof(int32_t)) * 2);
|
||||
pipe->InitBuffer(copyOutQueue, 1, AlignBytes(INT32_ONE_BLOCK_NUM, sizeof(int32_t)));
|
||||
pipe->InitBuffer(copyOutZeroQueue, 1, AlignBytes(this->perLoopCols, sizeof(int16_t)));
|
||||
|
||||
int64_t perLoopColsAlignBytes = AlignBytes(this->perLoopCols, sizeof(T));
|
||||
perLoopColsAlignBytes =
|
||||
Max(int64_t(perLoopColsAlignBytes * sizeof(float) / sizeof(T)), int64_t(BLOCK_BYTES + BLOCK_BYTES));
|
||||
|
||||
pipe->InitBuffer(inputXInQueue, 1, perLoopColsAlignBytes);
|
||||
pipe->InitBuffer(smoothInQueue, 1, AlignBytes(this->perLoopCols, sizeof(float)));
|
||||
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->perLoopCols, sizeof(float)));
|
||||
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->perLoopCols, sizeof(int8_t)));
|
||||
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
|
||||
pipe->InitBuffer(scaleOutZeroQueue, 1, BLOCK_BYTES);
|
||||
}
|
||||
|
||||
template <typename T, typename TilingData>
// Main entry for the gather+quantize stage. Chooses the single-pass path
// (CopyOut) when a whole row fits in UB, or the two-pass tiled path
// (CopyOutLoops) otherwise, then zero-pads remaining expert slots.
// NOTE(review): unlike MoeCustomSrcToDstWithCapacity::Process, there is no
// trailing SyncAll() here — confirm a later stage provides the cross-core
// barrier.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::Process()
{
    if (this->blockIdx < this->srcToDstTilingData->needCoreNum) {
        AssistInit();
        // Take ownership of the zero-fill tensors; released in CopyOutRemain().
        this->outTmpLocal = copyOutZeroQueue.DeQue<int8_t>();
        this->scaleOutTmpLocal = scaleOutZeroQueue.DeQue<float>();
        currentLoopRows = perLoopRows;
        if (colLoops > 1) {
            // Row does not fit in UB: tiled two-pass quantization.
            for (int64_t loop = 0; loop < this->rowLoops; loop++) {
                if (loop == this->rowLoops - 1) {
                    currentLoopRows = lastLoopRows;
                }
                CopyIn(loop);
                CopyOutLoops(loop);
            }
        } else {
            smoothLocal = smoothInQueue.AllocTensor<float>();
            if (smoothType == SCALE_1H) {
                // Single shared smooth row: load once and reuse for every token.
                DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0};
                DataCopyPad(smoothLocal, quantSmoothGm, smoothParams, {false, 0, 0, 0});
            }
            for (int64_t loop = 0; loop < this->rowLoops; loop++) {
                if (loop == this->rowLoops - 1) {
                    currentLoopRows = lastLoopRows;
                }
                CopyIn(loop);
                CopyOut(loop);
            }
            smoothInQueue.FreeTensor(smoothLocal);
        }
        // Zero-pad the tail expert slots (last active core only does real work).
        CopyOutRemain();
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_DYNAMIC_H
|
||||
// ===== begin file: moe_custom_sort_actual_expert.h =====
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_actual_expert.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_SORT_ACTUAL_EXPERT_H
|
||||
#define MOE_CUSTOM_SORT_ACTUAL_EXPERT_H
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
constexpr int64_t MULTI_GATHERED_SORT_CORE_NUM = 16;
|
||||
constexpr int64_t MULTI_GATHERED_SORT_THRSHOLD = 5632;
|
||||
constexpr int64_t SINGLE_GATHERED_BUFFER_NUM = 2;
|
||||
constexpr int64_t SINGLE_GATHERED_MAX_NUM = 21845;
|
||||
|
||||
template <typename T>
// Kernel stage that sorts tokens by expert id restricted to the active
// expert range [expertStart_, expertEnd_), choosing between a single-core
// gathered sort and a multi-core gathered sort depending on problem size
// (see MULTI_GATHERED_SORT_THRSHOLD).
class MoeSortActualExpert {
public:
    __aicore__ inline MoeSortActualExpert(){};
    // Binds GM buffers and unpacks tiling; must be called before Process().
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expendedRowIdx,
                                GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    // Returns a bool — presumably whether the multi-core path is needed;
    // TODO confirm against the (out-of-view) definition.
    __aicore__ inline bool Process();
    __aicore__ inline void multiCoreGatheredSort();
    __aicore__ inline void CopyOutExpandRowIdx();

private:
    __aicore__ inline void CopyIn();
    __aicore__ inline void SortCompute();
    // In-kernel per-core work split (needCoreNum_/perCoreElements_ etc.).
    __aicore__ inline void TilingInKernel();
    __aicore__ inline void ExpertCountCompute();
    __aicore__ inline void CopyOut();
    __aicore__ inline void CopyOutExpertCount();

private:
    TPipe *pipe;
    TBuf<TPosition::VECCALC> buffer_;  // shared UB arena, sliced via the *Offset_ fields
    TQueBind<TPosition::VECIN, TPosition::VECOUT, SINGLE_GATHERED_BUFFER_NUM> scaleCopyInQueue_;
    TQue<TPosition::VECOUT, 1> sortedNumCopyOutQueue_;

    GlobalTensor<T> xGm_;
    GlobalTensor<float> scaleGm_;
    GlobalTensor<T> expandedXGm_;
    GlobalTensor<int64_t> expertTokensCountOrCumsumGm_;
    GlobalTensor<float> expandedScaleGm_;
    GlobalTensor<int32_t> expendedRowIdxGm_;
    GlobalTensor<int32_t> expertIdxGm_;
    GlobalTensor<int32_t> workspaceGm_;
    GlobalTensor<float> workspaceExpertIdxGm_;
    GlobalTensor<int32_t> workspaceGatheredSortNumGm_;
    GlobalTensor<float> workspaceGatheredExpertIdxGm_;
    GlobalTensor<int32_t> workspaceGatheredExpertIndexGm_;

    // Byte offsets of the regions carved out of buffer_.
    int64_t expertIdxOffset_ = 0;
    int64_t expertIndexOffset_ = 0;
    int64_t compareScalarMaskOffset_ = 0;
    int64_t compareScalarMask0Offset_ = 0;
    int64_t compareScalarMask1Offset_ = 0;
    int64_t gatherMaskOffset_ = 0;

    int64_t totalLength_;        // n * k (total expanded rows)
    int64_t expertStart_ = 0;    // first expert id handled (inclusive)
    int64_t expertEnd_ = 0;      // last expert id handled (exclusive; see GT compare)
    int64_t actual_expert_num_ = 0;
    int64_t cols_ = 0;
    int64_t rowIdxType_ = 0;
    int64_t isInputScale_ = 0;
    int64_t k_ = 0;

    int64_t needSortNum_ = 0;    // tokens that survive the expert-range filter

    // Multi-core split (filled by TilingInKernel).
    int64_t needCoreNum_ = 0;
    int64_t perCoreElements_ = 0;
    int64_t lastCoreElements_ = 0;
    int64_t curCoreElements_ = 0;
    int64_t curCoreStartIndex_ = 0;

    bool needMultiSort = false;  // whether the multi-core sort path is used

    int64_t kvFactor = 2;        // key+value pair width used in sort buffers

    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
    static constexpr int64_t MASK_STRIDE = 64;
};
|
||||
|
||||
template <typename T>
// Loads the entire expert-id array (totalLength_ int32 values) from GM into
// the expertIdx slice of the shared UB arena.
__aicore__ inline void MoeSortActualExpert<T>::CopyIn()
{
    // Slice of buffer_ at expertIdxOffset_ (byte offset -> element index).
    LocalTensor<int32_t> expertIdx = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(this->totalLength_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(expertIdx, expertIdxGm_, dataCopyParams, dataCopyPadParams);
    // Vector ops that follow must wait for the MTE2 transfer.
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
}
|
||||
|
||||
// Filters the expert ids to the [expertStart_, expertEnd_) range owned by this
// rank, gathers the surviving (id, original-row-index) pairs, and sorts them by
// expert id. Ids are negated as fp32 so the hardware Sort's ordering yields
// ascending expert ids (assumes Sort is descending — confirm with AscendC docs).
// Side effects: sets actual_expert_num_, needSortNum_, and (for large counts)
// needMultiSort plus a workspace spill consumed by multiCoreGatheredSort().
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::SortCompute()
{
    LocalTensor<int32_t> expertIdx = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    LocalTensor<int32_t> gatheredExpertIdx = buffer_.Get<int32_t>();
    LocalTensor<float> gatheredExpertIdxFp32 = gatheredExpertIdx.ReinterpretCast<float>();

    // In-place int32 -> fp32 conversion, then negate so sorting keys invert order.
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, this->totalLength_);
    PipeBarrier<PIPE_V>();
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, this->totalLength_);
    PipeBarrier<PIPE_V>();

    LocalTensor<uint8_t> compareScalarMaskLocalTensor0 = buffer_.Get<uint8_t>()[compareScalarMask0Offset_];
    LocalTensor<uint8_t> compareScalarMaskLocalTensor1 = buffer_.Get<uint8_t>()[compareScalarMask1Offset_];
    LocalTensor<uint8_t> gatherMaskLocalTensor = buffer_.Get<uint8_t>()[gatherMaskOffset_];

    // mask0: -id <= -expertStart_  (i.e. id >= expertStart_).
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor0, expertIdxFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::LE,
        (this->totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();

    // mask1: -id > -expertEnd_  (i.e. id < expertEnd_).
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor1, expertIdxFp32, static_cast<float>(-expertEnd_), AscendC::CMPMODE::GT,
        (this->totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();
    // Combine both bounds into the gather bitmask.
    And(gatherMaskLocalTensor.ReinterpretCast<uint16_t>(), compareScalarMaskLocalTensor0.ReinterpretCast<uint16_t>(),
        compareScalarMaskLocalTensor1.ReinterpretCast<uint16_t>(),
        Ceil(this->totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE / kvFactor);
    PipeBarrier<PIPE_V>();

    // Compact the in-range (negated) ids; rsvdCnt receives the surviving count.
    uint64_t rsvdCnt = 0;
    GatherMaskParams gatherMaskParams;
    gatherMaskParams.repeatTimes = 1;
    gatherMaskParams.src0BlockStride = 1;
    gatherMaskParams.src0RepeatStride = 8;
    gatherMaskParams.src1RepeatStride = 8;
    GatherMask(gatheredExpertIdxFp32, expertIdxFp32, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(this->totalLength_), gatherMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>();
    actual_expert_num_ = rsvdCnt;
    // Handle actual_expert_num_ == 0: nothing to sort on this rank.
    if (actual_expert_num_ < 1) {
        return;
    }
    // Round up to the hardware sort granularity.
    int64_t needSortNum = Ceil(static_cast<int64_t>(rsvdCnt), ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    needSortNum_ = needSortNum;

    // Build 0..totalLength_-1 row indices and compact them with the same mask,
    // pairing each surviving id with its original row.
    LocalTensor<int32_t> expertIndex = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<int32_t> gatheredExpertIndex = buffer_.Get<int32_t>()[needSortNum];
    ArithProgression<int32_t>(expertIndex, 0, 1, this->totalLength_);
    GatherMask(gatheredExpertIndex, expertIndex, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(this->totalLength_), gatherMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>();
    if (rsvdCnt > MULTI_GATHERED_SORT_THRSHOLD) {
        // Too many pairs for a single-core sort: block 0 spills the gathered
        // data to workspace and Process() reruns via multiCoreGatheredSort().
        if (GetBlockIdx() == 0) {
            SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
            DataCopyExtParams copyParams{1, static_cast<uint32_t>(rsvdCnt * sizeof(int32_t)), 0, 0, 0};
            DataCopyPad(workspaceGatheredExpertIdxGm_, gatheredExpertIdxFp32, copyParams);
            DataCopyPad(workspaceGatheredExpertIndexGm_, gatheredExpertIndex, copyParams);
        }
        needMultiSort = true;
        return;
    }
    // Pad the tail of the last sort repeat with MIN_FP32 so padding sinks to the end.
    int64_t duplicateNum = rsvdCnt % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = rsvdCnt - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(gatheredExpertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    PipeBarrier<PIPE_V>();
    // Key/value sort: keys are negated ids, values are original row indices.
    LocalTensor<float> concatLocal;
    LocalTensor<float> sortTempTensor = buffer_.Get<float>()[needSortNum * kvFactor];
    Concat(concatLocal, gatheredExpertIdxFp32, sortTempTensor, needSortNum / ONE_REPEAT_SORT_NUM);
    LocalTensor<float> sortedLocal = buffer_.Get<float>()[needSortNum * kvFactor + needSortNum * kvFactor * kvFactor];
    Sort<float, true>(sortedLocal, concatLocal, gatheredExpertIndex.ReinterpretCast<uint32_t>(), sortTempTensor,
                      needSortNum / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<float> sortedExpertIdx = gatheredExpertIdxFp32;
    LocalTensor<int32_t> sortedExpertIndex = gatheredExpertIndex.ReinterpretCast<int32_t>();

    // Split the interleaved (key, value) result back into separate tensors.
    Extract(sortedExpertIdx, sortedExpertIndex.ReinterpretCast<uint32_t>(), sortedLocal,
            needSortNum / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    LocalTensor<int32_t> sortedExpertIdxInt32 = sortedExpertIdx.ReinterpretCast<int32_t>();

    // Undo the negation and convert back to int32 expert ids.
    Muls(sortedExpertIdx, sortedExpertIdx, (float)-1, rsvdCnt);
    Cast(sortedExpertIdxInt32, sortedExpertIdx, RoundMode::CAST_ROUND, rsvdCnt);
}
// Splits actual_expert_num_ sorted pairs across cores. In the multi-sort path
// only MULTI_GATHERED_SORT_CORE_NUM cores participate; otherwise all blocks do.
// NOTE(review): if actual_expert_num_ is 0 (SortCompute returned early),
// perCoreElements_ becomes 0 and Ceil(actual_expert_num_, perCoreElements_)
// divides by zero — confirm callers guarantee at least one in-range expert,
// or that Ceil() guards a zero divisor.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::TilingInKernel()
{
    int64_t coreNum = needMultiSort ? MULTI_GATHERED_SORT_CORE_NUM : GetBlockNum();
    perCoreElements_ = Ceil(actual_expert_num_, coreNum);
    needCoreNum_ = Ceil(actual_expert_num_, perCoreElements_);
    // The last active core takes the (possibly smaller) remainder.
    lastCoreElements_ = actual_expert_num_ - (needCoreNum_ - 1) * perCoreElements_;
    if (GetBlockIdx() == needCoreNum_ - 1) {
        curCoreElements_ = lastCoreElements_;
    } else {
        curCoreElements_ = perCoreElements_;
    }
    curCoreStartIndex_ = GetBlockIdx() * perCoreElements_;
}
// Multi-core fallback for large gathered counts: each participating core loads
// its slice of the spilled (id, index) pairs from workspace, pads and sorts it
// locally, then writes the sorted (key, value) stream plus its element count
// back to workspace for a later merge-sort stage.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::multiCoreGatheredSort()
{
    needSortNum_ = Ceil(static_cast<int64_t>(curCoreElements_), ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    // Re-tile over the fixed multi-sort core count (used for the output offset below).
    perCoreElements_ = Ceil(this->totalLength_, MULTI_GATHERED_SORT_CORE_NUM);

    LocalTensor<int32_t> sortedNumOutLocal = sortedNumCopyOutQueue_.AllocTensor<int32_t>();
    LocalTensor<float> gatheredExpertIdxFp32 = buffer_.Get<float>();
    LocalTensor<int32_t> gatheredExpertIndex = buffer_.Get<int32_t>()[needSortNum_];
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(curCoreElements_ * sizeof(float)),
                                     0, 0, 0};
    DataCopyPadExtParams<float> expertIdxPadParams{false, 0, 0, 0};
    DataCopyPad(gatheredExpertIdxFp32, workspaceGatheredExpertIdxGm_[curCoreStartIndex_], dataCopyParams,
                expertIdxPadParams);
    DataCopyPadExtParams<int32_t> expertIndexPadParams{false, 0, 0, 0};
    DataCopyPad(gatheredExpertIndex, workspaceGatheredExpertIndexGm_[curCoreStartIndex_], dataCopyParams,
                expertIndexPadParams);
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);

    LocalTensor<float> concatLocal;
    LocalTensor<float> sortTempTensor = buffer_.Get<float>()[needSortNum_ * kvFactor];
    // Pad the tail of the last sort repeat with MIN_FP32 so padding sinks to the end.
    int64_t duplicateNum = curCoreElements_ % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = curCoreElements_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(gatheredExpertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }
    Concat(concatLocal, gatheredExpertIdxFp32, sortTempTensor, needSortNum_ / ONE_REPEAT_SORT_NUM);
    LocalTensor<float> sortedLocal = buffer_.Get<float>()[needSortNum_ * kvFactor + needSortNum_ * kvFactor * kvFactor];
    Sort<float, true>(sortedLocal, concatLocal, gatheredExpertIndex.ReinterpretCast<uint32_t>(), sortTempTensor,
                      needSortNum_ / ONE_REPEAT_SORT_NUM);

    // Copy out the interleaved sorted (key, value) stream for the merge stage.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    int64_t curCoreSortedStartIndex = kvFactor * GetBlockIdx() * perCoreElements_;
    dataCopyParams.blockLen = static_cast<uint32_t>(kvFactor * curCoreElements_ * sizeof(float));
    DataCopyPad(workspaceExpertIdxGm_[curCoreSortedStartIndex], sortedLocal, dataCopyParams);
    // Copy out this core's sorted element count.
    sortedNumOutLocal.SetValue(0, curCoreElements_);
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    dataCopyParams.blockLen = static_cast<uint32_t>(sizeof(int32_t));
    DataCopyPad(workspaceGatheredSortNumGm_[GetBlockIdx()], sortedNumOutLocal, dataCopyParams);
    sortedNumCopyOutQueue_.FreeTensor(sortedNumOutLocal);
}
// Writes the sorted original-row indices to the expendedRowIdx output.
// Only block 0 copies out, although all active cores hold the same sorted
// result locally after the single-core sort path.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::CopyOutExpandRowIdx()
{
    LocalTensor<int32_t> sortedExpertIndex = buffer_.Get<int32_t>()[needSortNum_];
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    if (GetBlockIdx() == 0) {
        DataCopyExtParams copyParams{1, static_cast<uint32_t>(actual_expert_num_ * sizeof(int32_t)), 0, 0, 0};
        DataCopyPad(expendedRowIdxGm_, sortedExpertIndex, copyParams);
    }
}
// Builds a per-core histogram of token counts per local expert (index
// expertIdx - expertStart_) using scalar GetValue/SetValue, then accumulates it
// into the shared workspace histogram with an atomic add so all cores'
// contributions sum correctly. CopyOutExpertCount() later reads the total.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::ExpertCountCompute()
{
    // This core's slice of the sorted expert ids lives at curCoreStartIndex_.
    LocalTensor<int32_t> sortedExpertIdx = buffer_.Get<int32_t>()[curCoreStartIndex_];
    LocalTensor<int32_t> expertCountLocalTensor = buffer_.Get<int32_t>()[needSortNum_ * kvFactor];
    Duplicate(expertCountLocalTensor, 0, expertEnd_ - expertStart_);

    // Scalar histogram loop over this core's elements.
    for (int64_t i = 0; i < curCoreElements_; i++) {
        int64_t expertIdx = sortedExpertIdx.GetValue(i) - expertStart_;
        int32_t curExpertCount = expertCountLocalTensor.GetValue(expertIdx);
        expertCountLocalTensor.SetValue(expertIdx, curExpertCount + 1);
    }
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    DataCopyExtParams copyOutParams1{1, static_cast<uint32_t>((expertEnd_ - expertStart_) * sizeof(int32_t)), 0, 0, 0};
    // Atomic add merges the per-core partial histograms in workspace.
    SetAtomicAdd<int32_t>();
    DataCopyPad(workspaceGm_, expertCountLocalTensor, copyOutParams1);
    SetAtomicNone();
}
// Gathers token rows into their sorted positions: for each sorted pair this
// core owns, copies row (sortedIndex / k_) of x into row (curCoreStartIndex_ + i)
// of expandedX, and the matching per-row scale when isInputScale_ is set.
// Rows are moved one at a time through the unified buffer with explicit
// MTE2->MTE3 event ordering.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::CopyOut()
{
    LocalTensor<int32_t> sortedExpertIndex = buffer_.Get<int32_t>()[needSortNum_ + curCoreStartIndex_];
    // Staging area for one row of x, placed after the sort and histogram regions.
    int64_t xLocalOffset = (needSortNum_ * kvFactor + ASSIST_NUM) * sizeof(int32_t) / sizeof(T);
    LocalTensor<T> xLocalTensor = buffer_.Get<T>()[xLocalOffset];

    for (int64_t i = 0; i < curCoreElements_; i++) {
        // sortedExpertIndex holds flattened (row * k + slot) positions; /k_ recovers the source row.
        int64_t srcRow = sortedExpertIndex.GetValue(i) / k_;
        int64_t dstRow = i + curCoreStartIndex_;
        // Wait for the scalar index read before reusing the staging buffer for MTE2.
        SetWaitFlag<HardEvent::S_MTE2>(HardEvent::S_MTE2);

        LocalTensor<float> scaleLocalTensor;
        DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
        DataCopyPad(xLocalTensor, xGm_[srcRow * cols_], dataCopyParams, dataCopyPadParams);
        if (isInputScale_ == 1) {
            // One float scale per source row, staged through its own queue.
            scaleLocalTensor = scaleCopyInQueue_.AllocTensor<float>();
            DataCopyExtParams dataCopyParams2{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
            DataCopyPadExtParams<float> dataCopyPadParams2{false, 0, 0, 0};
            DataCopyPad(scaleLocalTensor, scaleGm_[srcRow], dataCopyParams2, dataCopyPadParams2);
            scaleCopyInQueue_.EnQue<float>(scaleLocalTensor);
        }
        // Ensure the inbound copy finished before copying the row back out.
        SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
        DataCopyExtParams copyOutParams1{1, static_cast<uint32_t>(cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPad(expandedXGm_[dstRow * cols_], xLocalTensor, copyOutParams1);
        if (isInputScale_ == 1) {
            scaleLocalTensor = scaleCopyInQueue_.DeQue<float>();
            DataCopyExtParams copyOutParams2{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
            DataCopyPad(expandedScaleGm_[dstRow], scaleLocalTensor, copyOutParams2);
            scaleCopyInQueue_.FreeTensor(scaleLocalTensor);
        }
    }
}
// Reads the accumulated int32 per-expert token counts from workspace, widens
// them to int64, and writes them to the expertTokensCountOrCumsum output.
// Run by a single core (the last block, per Process()) after SyncAll so every
// core's atomic-add contribution is visible.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::CopyOutExpertCount()
{
    LocalTensor<int32_t> expertCountLocalTensor = buffer_.Get<int32_t>()[needSortNum_ * kvFactor];
    LocalTensor<int64_t> expertCountLocalTensorInt64 =
        buffer_.Get<int32_t>()[needSortNum_ * kvFactor + ASSIST_NUM].ReinterpretCast<int64_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>((expertEnd_ - expertStart_) * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(expertCountLocalTensor, workspaceGm_, dataCopyParams, dataCopyPadParams);
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
    // int32 -> int64 widening cast (exact; no rounding involved).
    Cast(expertCountLocalTensorInt64, expertCountLocalTensor, RoundMode::CAST_NONE, (expertEnd_ - expertStart_));
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    DataCopyExtParams copyOutParams1{1, static_cast<uint32_t>((expertEnd_ - expertStart_) * sizeof(int64_t)), 0, 0, 0};
    DataCopyPad(expertTokensCountOrCumsumGm_, expertCountLocalTensorInt64, copyOutParams1);
}
// Binds all global-memory buffers, lays out the workspace (histogram region,
// merge-sort spill, gathered id/index spill, per-core sort counts) and the
// unified local buffer offsets, and initializes the queues. Block 0 zeroes the
// shared histogram region before any core accumulates into it.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX,
                                                    GM_ADDR expendedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                                    GM_ADDR expandedScale, GM_ADDR workspace,
                                                    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->pipe = tPipe;
    this->totalLength_ = tilingData->n * tilingData->k;
    cols_ = tilingData->cols;
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;
    isInputScale_ = tilingData->isInputScale;
    k_ = tilingData->k;

    expertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expertIdx);

    expendedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx);

    xGm_.SetGlobalBuffer((__gm__ T *)x);
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale);
    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX);
    expertTokensCountOrCumsumGm_.SetGlobalBuffer((__gm__ int64_t *)expertTokensCountOrCumsum);
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);
    workspaceGm_.SetGlobalBuffer((__gm__ int32_t *)workspace, ASSIST_NUM);
    if (GetBlockIdx() == 0) {
        // Zero the shared histogram before ExpertCountCompute's atomic adds.
        // NOTE(review): only block 0 waits on MTE3_MTE2 here — other cores'
        // visibility of the zeroed region presumably relies on a later SyncAll;
        // confirm against the kernel entry sequence.
        InitGlobalMemory(workspaceGm_, ASSIST_NUM, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    // Workspace layout (element offsets): [merge-sort k/v spill | gathered ids |
    // gathered indices | per-core sorted counts]. The first region aliases the
    // histogram region above.
    workspaceExpertIdxGm_.SetGlobalBuffer((__gm__ float *)workspace);
    int64_t offset = kvFactor * Align(this->totalLength_, sizeof(int32_t));
    workspaceGatheredExpertIdxGm_.SetGlobalBuffer((__gm__ float *)workspace + offset);
    offset += Align(this->totalLength_, sizeof(float));
    workspaceGatheredExpertIndexGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + offset);
    offset += Align(this->totalLength_, sizeof(float));
    workspaceGatheredSortNumGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + offset);

    // Unified-buffer byte offsets for the raw ids and the compare/gather masks.
    expertIdxOffset_ = AlignBytes(this->totalLength_, sizeof(int32_t));
    expertIndexOffset_ = expertIdxOffset_;

    gatherMaskOffset_ = expertIdxOffset_ * kvFactor;
    int64_t maskOffset =
        AlignBytes(Ceil(this->totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE, sizeof(int8_t));
    compareScalarMask0Offset_ = gatherMaskOffset_ + maskOffset;
    compareScalarMask1Offset_ = compareScalarMask0Offset_ + maskOffset;
    // Buffer is sized for the single-core worst case, independent of totalLength_.
    int64_t maskOffsetMax = Ceil(SINGLE_GATHERED_MAX_NUM, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE;
    int64_t bufferSize =
        AlignBytes(SINGLE_GATHERED_MAX_NUM, sizeof(int32_t)) * kvFactor + maskOffsetMax + maskOffsetMax + maskOffsetMax;
    pipe->InitBuffer(scaleCopyInQueue_, SINGLE_GATHERED_BUFFER_NUM, 32);
    pipe->InitBuffer(sortedNumCopyOutQueue_, SINGLE_GATHERED_BUFFER_NUM, 32);
    pipe->InitBuffer(buffer_, bufferSize); // 182992 Bytes
}
// Kernel driver for the "actual expert" path. Returns true when the routing
// completed on the single-core sort path; returns false when the gathered
// count exceeded the single-core threshold — in that case the data has been
// sorted per-core into workspace and the caller presumably continues with a
// merge-sort stage (confirm at the kernel entry point).
template <typename T>
__aicore__ inline bool MoeSortActualExpert<T>::Process()
{
    CopyIn();
    SortCompute();
    TilingInKernel();
    if (needMultiSort) {
        // Large-count fallback: every active core sorts its workspace slice.
        SyncAll();
        if (GetBlockIdx() < needCoreNum_) {
            multiCoreGatheredSort();
        }
        SyncAll();
        return false;
    }

    if (GetBlockIdx() < needCoreNum_) {
        CopyOutExpandRowIdx();
    }
    if (GetBlockIdx() < needCoreNum_) {
        ExpertCountCompute();
        CopyOut();
    }
    // Wait for all histogram contributions before the final count copy-out.
    SyncAll();
    if (GetBlockIdx() == GetBlockNum() - 1) {
        CopyOutExpertCount();
    }
    return true;
}
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_SORT_ACTUAL_EXPERT_H
|
||||
@@ -0,0 +1,71 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_base.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_SORT_BASE_H
|
||||
#define MOE_CUSTOM_SORT_BASE_H
|
||||
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Common state and helpers shared by the MoE routing sort kernels
// (single-core and multi-core variants derive from this).
class MoeSortBase {
public:
    __aicore__ inline MoeSortBase(){};
    __aicore__ inline int64_t GetSyncRound();

protected:
    __aicore__ inline void CleanWSCache();
    // Full-device barrier; thin wrapper over AscendC::SyncAll().
    __aicore__ inline void SyncAll();

protected:
    TPipe *pipe;
    // Double-ended staging queues for sort input/output tiles.
    TQue<QuePosition::VECIN, 1> sortDataCopyInQueue;
    TQue<QuePosition::VECOUT, 1> sortDataCopyOutQueue;
    // Scratch buffers for sort intermediates.
    TBuf<TPosition::VECCALC> tempBuffer;
    TBuf<TPosition::VECCALC> sortedBuffer;

    GlobalTensor<int32_t> expertIdxGm;
    GlobalTensor<int32_t> expendedRowIdxGm;
    GlobalTensor<int32_t> sortedExpertForSourceRowGm;
    GlobalTensor<int32_t> expandDstToSrcRowGm;
    GlobalTensor<int32_t> sortedexpertIdxGm;
    GlobalTensor<int32_t> expertCountTempGm;

    int64_t tileLength;
    int64_t bufferNum = 1;
    int64_t totalLength;   // n * k routed pairs in total
    int64_t coreNum;

    // Expert id range [expertStart_, expertEnd_) owned by this rank.
    int64_t expertStart_ = 0;
    int64_t expertEnd_ = 0;
    int64_t n;
    int64_t k;
    int64_t ep_ = 0;       // expert-parallelism switch (non-zero enables range masking)
    int64_t oneLoopMaxElements_;
    int64_t rowIdxType_ = 0;

    static constexpr int64_t SYNC_GM_NUM = 2;
    static constexpr int64_t WORK_GM_NUM = 2;    // ping-pong workspace count
    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
};
// Full-device barrier: blocks until every core reaches this point.
__aicore__ inline void MoeSortBase::SyncAll()
{
    AscendC::SyncAll();
}
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_SORT_BASE_H
|
||||
@@ -0,0 +1,377 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_multi_core.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_VBS_ONE_CORE_H
|
||||
#define MOE_CUSTOM_VBS_ONE_CORE_H
|
||||
|
||||
#include "moe_custom_sort_base.h"
|
||||
#include "moe_custom_mrgsort.h"
|
||||
#include "moe_custom_mrgsort_out.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Multi-core MoE routing sort: per-core UB-tile sorts (VBS), then iterative
// 4-way merge rounds across cores (VMS), then a final single-core merge that
// writes the sorted outputs (SortOut). Workspace ping-pongs between two GMs.
class MoeSortMultiCore : public MoeSortBase {
public:
    __aicore__ inline MoeSortMultiCore(){};
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Stage drivers.
    __aicore__ inline void VBSProcess();
    __aicore__ inline void UBSortProcess(int64_t progress, int64_t size, int64_t sortNum);
    __aicore__ inline void OneCoreVMSProcess(int64_t listNum, int64_t perListElements, int64_t lastListElements);
    __aicore__ inline void VMSProcess();
    __aicore__ inline void SortOutProcess();
    // Per-tile sort pipeline steps.
    __aicore__ inline void VBSCopyIn(int64_t progress, int64_t size, int64_t sortNum);
    __aicore__ inline void UBSortCompute(int64_t progress, int64_t size, int64_t sortNum);
    __aicore__ inline void VBSCopyOut(int64_t progress, int64_t size, int64_t sortNum);
    // Merge-sorter wiring helpers.
    __aicore__ inline void InitMoeMrgSort(MoeMrgsort *sorter, int64_t listNum, int64_t coreOffset, int64_t loopOffset);
    __aicore__ inline void InitMoeMrgSortOut(MoeMrgsortOut *sorter, int64_t listNum, int64_t coreOffset);

private:
    // Ping-pong workspaces; srcWsIndex selects the current source side.
    GlobalTensor<float> workspaceGms[2];

    const MoeCustomVBSComputeTilingData *vbsTilingData;
    const MoeCustomVMSMiddleComputeTilingData *vmsTilingData;
    const MoeCustomSortOutComputeTilingData *sortOutTilingData;

    // for MoeMrgsort
    MoeMrgsort mrgsorter;
    MoeMrgsortParam mrgsortParam;

    int64_t coreNum;
    int64_t blockIdx;
    int64_t srcWsIndex = 0;

    // Current merge-round list geometry.
    int64_t listNum;
    int64_t perListElements;
    int64_t lastListElements;

    // Per-core VBS tiling.
    int64_t sortTotalLength;
    int64_t sortCoreLoops;
    int64_t sortCoreLoopElements;
    int64_t sortCoreLastLoopElements;

    int64_t perCoreExpert;
    int64_t needInitExpertCore;
    int64_t currentCoreExpert;

    // Hardware merge-sort fan-in.
    static constexpr int64_t MAX_MRGSORT_LIST = 4;
};
// Loads one UB tile of expert ids for this core and generates the matching
// global row indices (blockIdx * perCoreElements + tile offset) in the second
// half of the tile buffer.
__aicore__ inline void MoeSortMultiCore::VBSCopyIn(int64_t progress, int64_t size, int64_t sortNum)
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.AllocTensor<int32_t>();
    int64_t inOffset = progress * sortCoreLoopElements;
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(size * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm[inOffset], dataCopyParams, dataCopyPadParams);

    // Row indices occupy inLocal[sortNum ..]; values are absolute row numbers.
    LocalTensor<int32_t> rowIdxLocal = inLocal[sortNum];
    int64_t startValue = this->blockIdx * this->vbsTilingData->perCoreElements + inOffset;
    // Wait for any prior MTE3 use of this buffer before scalar/vector writes.
    SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
    ArithProgression<int32_t>(rowIdxLocal, startValue, 1, size);
    sortDataCopyInQueue.EnQue(inLocal);
}
// Sorts one UB tile: casts ids to fp32 in place and negates them so the
// hardware Sort's ordering yields ascending expert ids (assumes Sort is
// descending — confirm with AscendC docs). Under expert parallelism (ep_),
// ids below expertStart_ are replaced by MIN_FP32 so they sink to the end
// (NOTE(review): only the lower bound is masked here — verify the upper bound
// is handled elsewhere, and check the Select operand order). The repeat tail
// is padded with MIN_FP32 before the paired key/value Sort.
__aicore__ inline void MoeSortMultiCore::UBSortCompute(int64_t progress, int64_t size, int64_t sortNum)
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> expertForSourceRowLocal = inLocal[0];
    LocalTensor<float> expertForSourceRowLocalFp32;

    // In-place int32 -> fp32 conversion of the sort keys.
    expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
    Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum);

    Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum);

    if (ep_) {
        LocalTensor<uint8_t> maskLocalTensor = sortedBuffer.Get<uint8_t>();
        // mask: -id > -expertStart_  (i.e. id < expertStart_).
        AscendC::CompareScalar(
            maskLocalTensor, expertForSourceRowLocalFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::GT,
            (sortNum + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        LocalTensor<float> floatMinLocalTensor = tempBuffer.Get<float>();
        Duplicate(floatMinLocalTensor, MIN_FP32, sortNum);
        Select(expertForSourceRowLocalFp32, maskLocalTensor, floatMinLocalTensor, expertForSourceRowLocalFp32,
               SELMODE::VSEL_TENSOR_TENSOR_MODE, sortNum);
    }

    // Pad the tail of the last sort repeat with MIN_FP32.
    int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = size - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    // Paired sort: keys are the negated ids, values are the row indices.
    LocalTensor<float> concatLocal = expertForSourceRowLocalFp32;
    LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(sortNum));
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    LocalTensor<uint32_t> sourceRowLocal;
    sourceRowLocal = inLocal[sortNum].ReinterpretCast<uint32_t>();
    Sort<float, true>(outLocal, concatLocal, sourceRowLocal, sortedLocal, sortNum / ONE_REPEAT_SORT_NUM);

    sortDataCopyOutQueue.EnQue<float>(outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
}
// Writes one sorted tile's interleaved (key, value) stream into workspace 0 at
// this core's region, offset by the tile's position within the core.
__aicore__ inline void MoeSortMultiCore::VBSCopyOut(int64_t progress, int64_t size, int64_t sortNum)
{
    LocalTensor<float> outLocal = sortDataCopyOutQueue.DeQue<float>();
    DataCopy(workspaceGms[0][this->blockIdx * GetSortLen<float>(this->vbsTilingData->perCoreElements) +
                             GetSortLen<float>(progress * sortCoreLoopElements)],
             outLocal, Align(GetSortLen<float>(size), sizeof(float)));
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
// Wires the merge sorter's inputs/outputs for one merge pass: source lists are
// read from the current ping-pong workspace, merged output goes to the other.
// NOTE(review): the same srcWsGm handle is registered for every input list —
// this assumes MoeMrgsort::SetInput offsets per registered list internally;
// verify against the MoeMrgsort implementation.
__aicore__ inline void MoeSortMultiCore::InitMoeMrgSort(MoeMrgsort *sorter, int64_t listNum, int64_t coreOffset,
                                                        int64_t loopOffset)
{
    GlobalTensor<float> srcWsGm = workspaceGms[srcWsIndex][blockIdx * coreOffset + loopOffset];
    // Tensors are only used to describe UB regions for the sorter; they are
    // freed immediately after registration.
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    for (int64_t i = 0; i < listNum; i++) {
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(oneLoopMaxElements_) * i];
        sorter->SetInput(srcWsGm, inLocalT);
    }
    GlobalTensor<float> dstWsGm = workspaceGms[1 - srcWsIndex][blockIdx * coreOffset + loopOffset];
    sorter->SetOutput(dstWsGm, outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
// Wires the final-stage merge sorter: inputs come from the current ping-pong
// workspace; outputs go directly to the sorted expert-id and row-index GMs.
// NOTE(review): as in InitMoeMrgSort, the same srcWsGm handle is registered
// for every list — assumes per-list offsetting inside MoeMrgsortOut; verify.
__aicore__ inline void MoeSortMultiCore::InitMoeMrgSortOut(MoeMrgsortOut *sorter, int64_t listNum, int64_t coreOffset)
{
    GlobalTensor<float> srcWsGm = workspaceGms[srcWsIndex];
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();

    for (int64_t i = 0; i < listNum; i++) {
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(oneLoopMaxElements_) * i];
        sorter->SetInput(srcWsGm, inLocalT);
    }

    // Keys go to outLocal, values (row indices) to outLocalV.
    LocalTensor<float> outLocalV = outLocal[oneLoopMaxElements_ * MAX_MRGSORT_LIST];
    sorter->SetOutput(this->sortedexpertIdxGm, this->expendedRowIdxGm, outLocal, outLocalV);

    LocalTensor<float> tempBuffer = sortedBuffer.Get<float>(GetSortLen<float>(oneLoopMaxElements_) * MAX_MRGSORT_LIST);
    sorter->SetBuffer(tempBuffer);
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
// Single-core iterative merge: repeatedly 4-way-merges this core's sorted
// lists (swapping ping-pong workspaces each round) until one list remains.
// The last list of each round may be shorter (lastListElements).
__aicore__ inline void MoeSortMultiCore::OneCoreVMSProcess(int64_t listNum, int64_t perListElements,
                                                           int64_t lastListElements)
{
    int64_t coreOffset = GetSortLen<float>(this->vbsTilingData->perCoreElements);
    mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;

    // Loop condition is effectively "until one list remains" — the break below
    // fires once a round collapses everything into a single list.
    for (int64_t i = 0; listNum >= 1; i++) {
        int64_t loops = (listNum + MAX_MRGSORT_LIST - 1) / MAX_MRGSORT_LIST;
        int64_t remainListNum = listNum - (loops - 1) * MAX_MRGSORT_LIST;

        // Full 4-list merges: all lists are the regular size.
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.lastListElements = perListElements;

        int64_t loopOffset = GetSortLen<float>(mrgsortParam.perListElements * MAX_MRGSORT_LIST);
        for (int64_t loop = 0; loop < loops - 1; loop++) {
            InitMoeMrgSort(&mrgsorter, MAX_MRGSORT_LIST, coreOffset, loop * loopOffset);
            mrgsorter.Init(&mrgsortParam);
            mrgsorter.Process();
        }

        // Final (possibly partial) merge of this round carries the short tail list.
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.lastListElements = lastListElements;
        InitMoeMrgSort(&mrgsorter, remainListNum, coreOffset, (loops - 1) * loopOffset);
        mrgsorter.Init(&mrgsortParam);
        mrgsorter.Process();

        // Next round: merged lists are MAX_MRGSORT_LIST times larger.
        listNum = loops;
        lastListElements = perListElements * (remainListNum - 1) + lastListElements;
        perListElements = perListElements * MAX_MRGSORT_LIST;
        srcWsIndex = (srcWsIndex + 1) % WORK_GM_NUM;
        if (loops == 1) {
            break;
        }
    }
}
// One UB tile through the three-step sort pipeline: copy in, sort, copy out.
__aicore__ inline void MoeSortMultiCore::UBSortProcess(int64_t progress, int64_t size, int64_t sortNum)
{
    VBSCopyIn(progress, size, sortNum);
    UBSortCompute(progress, size, sortNum);
    VBSCopyOut(progress, size, sortNum);
}
// VBS stage: each active core sorts its tiles one UB loop at a time (the last
// loop may be smaller), then merges its own tiles locally when there was more
// than one. Ends with a device-wide barrier so VMS sees all per-core results.
__aicore__ inline void MoeSortMultiCore::VBSProcess()
{
    if (this->blockIdx < this->vbsTilingData->needCoreNum) {
        int64_t sortNum = Ceil(sortCoreLoopElements, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
        for (int64_t loop = 0; loop < sortCoreLoops - 1; loop++) {
            UBSortProcess(loop, sortCoreLoopElements, sortNum);
        }

        // Last loop uses its own (possibly smaller) element count.
        sortNum = Ceil(sortCoreLastLoopElements, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
        UBSortProcess(sortCoreLoops - 1, sortCoreLastLoopElements, sortNum);

        if (sortCoreLoops > 1) {
            OneCoreVMSProcess(sortCoreLoops, sortCoreLoopElements, sortCoreLastLoopElements);
        }
    }
    SyncAll();
}
// VMS stage: cross-core merge rounds. While more than MAX_MRGSORT_LIST sorted
// per-core lists remain, each active core 4-way-merges a group of lists (the
// last core takes the shorter tail group), the ping-pong workspaces swap, and
// all cores barrier before the next round. Exits with at most
// MAX_MRGSORT_LIST lists for SortOutProcess().
__aicore__ inline void MoeSortMultiCore::VMSProcess()
{
    int64_t currentStageNeedCoreNum = this->vmsTilingData->needCoreNum;
    perListElements = this->vbsTilingData->perCoreElements;
    lastListElements = this->vbsTilingData->lastCoreElements;
    listNum = this->vbsTilingData->needCoreNum;

    for (; listNum > MAX_MRGSORT_LIST;) {
        currentStageNeedCoreNum = Ceil(listNum, MAX_MRGSORT_LIST);
        int64_t coreOffset = GetSortLen<float>(perListElements * MAX_MRGSORT_LIST);
        int64_t remainListNum = listNum - (currentStageNeedCoreNum - 1) * MAX_MRGSORT_LIST;

        if (this->blockIdx < currentStageNeedCoreNum - 1) {
            // Regular cores merge MAX_MRGSORT_LIST equal-sized lists.
            mrgsortParam.perListElements = perListElements;
            mrgsortParam.lastListElements = perListElements;
            mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;
            InitMoeMrgSort(&mrgsorter, MAX_MRGSORT_LIST, coreOffset, 0);
            mrgsorter.Init(&mrgsortParam);
            mrgsorter.Process();
        } else if (this->blockIdx == currentStageNeedCoreNum - 1) {
            // The last active core merges the remaining (shorter) group.
            mrgsortParam.perListElements = perListElements;
            mrgsortParam.lastListElements = lastListElements;
            mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;
            InitMoeMrgSort(&mrgsorter, remainListNum, coreOffset, 0);
            mrgsorter.Init(&mrgsortParam);
            mrgsorter.Process();
        }
        listNum = currentStageNeedCoreNum;
        currentStageNeedCoreNum = Ceil(listNum, MAX_MRGSORT_LIST);
        srcWsIndex = (srcWsIndex + 1) % WORK_GM_NUM;

        // Geometry for the next round: merged lists are 4x larger.
        lastListElements = perListElements * (remainListNum - 1) + lastListElements;
        perListElements = perListElements * MAX_MRGSORT_LIST;

        SyncAll();
    }
}
// Final stage: core 0 merges the remaining (<= MAX_MRGSORT_LIST) sorted lists
// and writes the results straight to the sorted-expert-id and row-index
// outputs; all cores barrier afterward.
__aicore__ inline void MoeSortMultiCore::SortOutProcess()
{
    if (this->blockIdx < 1) {
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.lastListElements = lastListElements;
        mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;

        MoeMrgsortOut sorter;
        InitMoeMrgSortOut(&sorter, listNum, GetSortLen<float>(perListElements));
        sorter.Init(&mrgsortParam, pipe);
        sorter.Process();
    }
    SyncAll();
}
// One-time setup for the multi-core sort pipeline.
//   expertIdx      : [n*k] int32 expert id per (token, top-k) pair (input).
//   expendedRowIdx : output row-index buffer, used directly in SCATTER mode.
//   workspace      : user GM scratch; layout established below.
//   tilingData     : host-computed split parameters; tPipe: UB buffer manager.
//
// Workspace layout (int32 units, T = Align(n*k, sizeof(int32_t))):
//   [0,   T)                    sorted expert ids
//   [T,   2T)                   expanded row idx (GATHER mode staging)
//   [2T,  2T+actualExpertNum)   per-expert counters (zeroed by core 0)
//   then two ping-pong (key,value) float regions of 2T each for merge rounds.
__aicore__ inline void MoeSortMultiCore::Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                              const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->totalLength = tilingData->n * tilingData->k;
    this->coreNum = tilingData->coreNum;
    this->vbsTilingData = &(tilingData->vbsComputeParamsOp);
    this->vmsTilingData = &(tilingData->vmsMiddleComputeParamsOp);
    this->sortOutTilingData = &(tilingData->sortOutComputeParamsOp);

    this->blockIdx = GetBlockIdx();
    this->tileLength = this->vbsTilingData->perCorePerLoopElements;
    this->sortTotalLength = this->vbsTilingData->perCoreElements;
    // The last active core gets the (possibly smaller) remainder slice.
    if (this->blockIdx == tilingData->vbsComputeParamsOp.needCoreNum - 1) {
        this->tileLength = this->vbsTilingData->lastCorePerLoopElements;
        this->sortTotalLength = this->vbsTilingData->lastCoreElements;
    }
    this->n = tilingData->n;
    this->k = tilingData->k;
    this->ep_ = tilingData->ep;
    // With expert-parallel (ep) routing the merge-loop element cap comes from
    // tiling; otherwise the compile-time maximum applies.
    this->oneLoopMaxElements_ = ep_ ? this->sortOutTilingData->oneLoopMaxElements : MRGSORT_LIST_MAX_ELEMENT;

    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;

    // VBS param init
    if (this->blockIdx == this->vbsTilingData->needCoreNum - 1) {
        sortCoreLoops = this->vbsTilingData->lastCoreLoops;
        sortCoreLoopElements = this->vbsTilingData->lastCorePerLoopElements;
        sortCoreLastLoopElements = this->vbsTilingData->lastCoreLastLoopElements;
    } else {
        sortCoreLoops = this->vbsTilingData->perCoreLoops;
        sortCoreLoopElements = this->vbsTilingData->perCorePerLoopElements;
        sortCoreLastLoopElements = this->vbsTilingData->perCoreLastLoopElements;
    }

    this->pipe = tPipe;
    // Each core views only its own slice of the expert-id input.
    expertIdxGm.SetGlobalBuffer((__gm__ int32_t *)expertIdx +
                                    this->blockIdx * tilingData->vbsComputeParamsOp.perCoreElements,
                                this->sortTotalLength);
    sortedexpertIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(workspace),
                                      Align(this->totalLength, sizeof(int32_t)));
    if (rowIdxType_ == SCATTER) {
        // SCATTER: row indices are written straight to the caller's output.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx, Align(this->totalLength, sizeof(int32_t)));
    } else {
        // GATHER: stage row indices in workspace for a later gather pass.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->totalLength, sizeof(int32_t)),
                                         Align(this->totalLength, sizeof(int32_t)));
    }

    if (GetBlockIdx() == 0) {
        // Core 0 zeroes the per-expert token counters once, then raises an
        // MTE3->MTE2 flag so its own later reads see the cleared memory.
        expertCountTempGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                              Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2,
                                          tilingData->actualExpertNum);
        InitGlobalMemory(expertCountTempGm, tilingData->actualExpertNum, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }

    // key and value
    int64_t kvFactor = 2;
    // Two ping-pong merge regions, each sized for (key,value) pairs.
    workspaceGms[0].SetGlobalBuffer((__gm__ float *)workspace + Align(this->totalLength, sizeof(int32_t)) * 2 +
                                        tilingData->actualExpertNum,
                                    Align(this->totalLength, sizeof(int32_t)) * kvFactor);
    workspaceGms[1].SetGlobalBuffer((__gm__ float *)workspace +
                                        Align(this->totalLength, sizeof(int32_t)) * (kvFactor + 2) +
                                        tilingData->actualExpertNum,
                                    Align(this->totalLength, sizeof(int32_t)) * kvFactor);

    // UB buffers must hold the larger of a merge-round tile and a VBS tile,
    // rounded up to a whole Sort32 repeat, in (key,value) pairs.
    int64_t bufferSize = Ceil(Max(oneLoopMaxElements_ * MAX_MRGSORT_LIST, sortCoreLoopElements), ONE_REPEAT_SORT_NUM) *
                         ONE_REPEAT_SORT_NUM * sizeof(int32_t) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortedBuffer, bufferSize);
    if (ep_) {
        // Extra scratch is only needed for the ep masking path.
        pipe->InitBuffer(tempBuffer, bufferSize);
    }
}
|
||||
|
||||
// Runs the three-stage multi-core sort pipeline in order:
// per-core block sort (VBS), multi-round list merge (VMS),
// then the final single-core merge-out. Each stage synchronizes
// internally, so the ordering here is the full contract.
__aicore__ inline void MoeSortMultiCore::Process()
{
    VBSProcess();
    VMSProcess();
    SortOutProcess();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_VBS_ONE_CORE_H
|
||||
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_multi_core_performance.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_VBS_ONE_CORE_PERFORMANCE_H
|
||||
#define MOE_CUSTOM_VBS_ONE_CORE_PERFORMANCE_H
|
||||
|
||||
#include "moe_custom_sort_base.h"
|
||||
#include "moe_custom_mrgsort_performance.h"
|
||||
#include "moe_custom_mrgsort_out_performance.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Performance variant of the multi-core sorter: assumes per-core sorted runs
// already exist in workspace (produced by an earlier gather-sort stage) and
// performs one fixed VMS merge round followed by a single-core merge-out.
class MoeSortMultiCorePerformance : public MoeSortBase {
public:
    __aicore__ inline MoeSortMultiCorePerformance(){};
    // Binds GM buffers / tiling and allocates UB queues; must be called
    // before Process(). No expertIdx input: keys are read from workspace.
    __aicore__ inline void Init(GM_ADDR expendedRowIdx, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void VMSProcess();
    __aicore__ inline void SortOutProcess();
    __aicore__ inline void InitMoeMrgSort(MoeMrgsortPerformance *sorter, int64_t coreOffset);
    __aicore__ inline void InitMoeMrgSortOut(MoeMrgsortOutPerformance *sorter);

private:
    // Ping-pong (key,value) merge regions in GM: [0] source, [1] destination.
    GlobalTensor<float> workspaceGms[2];
    // Per-list element counts gathered by the previous stage.
    GlobalTensor<int32_t> workspaceGatheredSortNumGm_;

    const MoeCustomSortOutComputeTilingData *sortOutTilingData;
    const MoeCustomVBSComputeTilingData *vbsTilingData;

    // for MoeMrgsortPerformance
    MoeMrgsortPerformance mrgsorter;
    MoeMrgsortPerformanceParam mrgsortParam;

    int64_t blockIdx;

    // Nominal elements per merge list, and the per-loop UB element cap.
    int64_t perListElements;
    int64_t maxPerListElements;
};
|
||||
|
||||
// Wires one merge-round sorter: this core merges MAX_MRGSORT_LIST adjacent
// lists from workspaceGms[0] into workspaceGms[1], with per-list lengths
// taken from the gathered-sort-num table.
// coreOffset is this core's byte offset into the ping-pong regions.
__aicore__ inline void MoeSortMultiCorePerformance::InitMoeMrgSort(MoeMrgsortPerformance *sorter, int64_t coreOffset)
{
    GlobalTensor<float> srcWsGm = workspaceGms[0][this->blockIdx * coreOffset]; // 0-3
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    GlobalTensor<int32_t> sortNumGm = workspaceGatheredSortNumGm_[this->blockIdx * MAX_MRGSORT_LIST];
    for (int64_t i = 0; i < MAX_MRGSORT_LIST; i++) {
        // Each call registers one input list slot, with its own UB staging
        // slice; srcWsGm/sortNumGm are passed whole each time — SetInput
        // presumably advances per call (NOTE(review): confirm in
        // MoeMrgsortPerformance::SetInput).
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(maxPerListElements) * i];
        sorter->SetInput(srcWsGm, inLocalT, sortNumGm);
    }
    GlobalTensor<float> dstWsGm = workspaceGms[1][this->blockIdx * coreOffset];
    sorter->SetOutput(dstWsGm, outLocal);
    // Tensors are returned to their queues immediately; the sorter keeps the
    // UB addresses and manages reuse itself.
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// Wires the final merge-out sorter: merges the MAX_MRGSORT_LIST lists left in
// workspaceGms[1] and writes expert ids to sortedexpertIdxGm and row indices
// to expendedRowIdxGm.
__aicore__ inline void MoeSortMultiCorePerformance::InitMoeMrgSortOut(MoeMrgsortOutPerformance *sorter)
{
    GlobalTensor<float> srcWsGm = workspaceGms[1];
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    GlobalTensor<int32_t> sortNumGm = workspaceGatheredSortNumGm_;
    for (int64_t i = 0; i < MAX_MRGSORT_LIST; i++) {
        // One UB staging slice per input list (see InitMoeMrgSort for the
        // repeated-SetInput convention).
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(maxPerListElements) * i];
        sorter->SetInput(srcWsGm, inLocalT, sortNumGm);
    }

    // outLocal is split: first half for keys, second half for values.
    LocalTensor<float> outLocalV = outLocal[maxPerListElements * MAX_MRGSORT_LIST];
    sorter->SetOutput(this->sortedexpertIdxGm, this->expendedRowIdxGm, outLocal, outLocalV);

    // Local name intentionally shadows the member tempBuffer: this is a UB
    // view carved from sortedBuffer, handed to the sorter as scratch.
    LocalTensor<float> tempBuffer = sortedBuffer.Get<float>(GetSortLen<float>(maxPerListElements) * MAX_MRGSORT_LIST);
    sorter->SetBuffer(tempBuffer);
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// Single fixed merge round: the first MAX_MRGSORT_LIST cores each merge
// MAX_MRGSORT_LIST lists from workspaceGms[0] into workspaceGms[1];
// all cores then synchronize.
__aicore__ inline void MoeSortMultiCorePerformance::VMSProcess()
{
    int64_t currentStageNeedCoreNum = MAX_MRGSORT_LIST;
    // Byte span of one core's group of input lists in the ping-pong region.
    int64_t coreOffset = GetSortLen<float>(perListElements * MAX_MRGSORT_LIST);
    if (this->blockIdx <= currentStageNeedCoreNum - 1) {
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.oneLoopMaxElements = maxPerListElements;
        InitMoeMrgSort(&mrgsorter, coreOffset);
        mrgsorter.Init(&mrgsortParam);
        mrgsorter.Process();
    }
    SyncAll();
}
|
||||
|
||||
// Final stage: core 0 merges the lists produced by VMSProcess into the
// output buffers, then zeroes the per-expert counters (the counter region is
// only safe to clear after the merge has consumed the workspace).
__aicore__ inline void MoeSortMultiCorePerformance::SortOutProcess()
{
    if (this->blockIdx < 1) {
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.oneLoopMaxElements = maxPerListElements;
        MoeMrgsortOutPerformance sorter;
        InitMoeMrgSortOut(&sorter);
        sorter.Init(&mrgsortParam, pipe);
        sorter.Process();
        // Clear one counter slot per local expert (expertStart_..expertEnd_).
        InitGlobalMemory(expertCountTempGm, expertEnd_ - expertStart_, 0);
        // Ensure the GM clear (MTE3) is visible before subsequent reads (MTE2).
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    SyncAll();
}
|
||||
|
||||
// One-time setup for the performance sorter. Keys are expected to already be
// staged in workspace by the preceding gather-sort stage, so no expertIdx
// input is taken.
//
// Workspace layout (T = Align(n*k, sizeof(int32_t)) int32 elements):
//   [0,  2T)   workspaceGms[0] — (key,value) source lists
//   [2T, 4T)   workspaceGms[1] — (key,value) merge destination
//   [4T, ...)  gathered per-list sort counts (MAX_MRGSORT_LIST_TOTAL entries)
//   expertCountTempGm is placed at 2T, overlapping workspaceGms[1] —
//   NOTE(review): this looks like deliberate reuse, since the counters are
//   only cleared in SortOutProcess after the merge has consumed that region;
//   confirm against the consuming expert-count kernel.
__aicore__ inline void MoeSortMultiCorePerformance::Init(GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                                         const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->totalLength = tilingData->n * tilingData->k;
    this->blockIdx = GetBlockIdx();
    this->n = tilingData->n;
    this->k = tilingData->k;
    this->vbsTilingData = &(tilingData->vbsComputeParamsOp);
    this->sortOutTilingData = &(tilingData->sortOutComputeParamsOp);
    // Split all n*k elements evenly over the total number of merge lists.
    this->perListElements = Ceil(this->totalLength, MAX_MRGSORT_LIST_TOTAL);
    this->maxPerListElements = this->sortOutTilingData->oneLoopMaxElements;

    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;

    this->pipe = tPipe;
    sortedexpertIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(workspace),
                                      Align(this->totalLength, sizeof(int32_t)));
    if (rowIdxType_ == SCATTER) {
        // SCATTER: write row indices straight to the caller's output.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx, Align(this->totalLength, sizeof(int32_t)));
    } else {
        // GATHER: stage row indices in workspace for the later gather pass.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->totalLength, sizeof(int32_t)),
                                         Align(this->totalLength, sizeof(int32_t)));
    }

    // key and value
    int64_t kvFactor = 2;
    workspaceGms[0].SetGlobalBuffer((__gm__ float *)workspace, Align(this->totalLength, sizeof(float)) * kvFactor);
    workspaceGms[1].SetGlobalBuffer((__gm__ float *)workspace + Align(this->totalLength, sizeof(float)) * kvFactor,
                                    Align(this->totalLength, sizeof(float)) * kvFactor);
    workspaceGatheredSortNumGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                                    Align(this->totalLength, sizeof(int32_t)) * kvFactor * kvFactor,
                                                MAX_MRGSORT_LIST_TOTAL);
    expertCountTempGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->totalLength, sizeof(int32_t)) * 2,
                                      expertEnd_ - expertStart_);

    // UB buffers sized for one full merge group of (key,value) pairs,
    // rounded up to whole Sort32 repeats.
    int64_t bufferSize = Ceil(maxPerListElements * MAX_MRGSORT_LIST, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM *
                         sizeof(float) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortedBuffer, bufferSize);
    pipe->InitBuffer(tempBuffer, bufferSize);
}
|
||||
|
||||
// Runs the two-stage performance pipeline: one multi-core merge round, then
// the single-core merge-out. Each stage ends in SyncAll(), so the call order
// here is the full contract.
__aicore__ inline void MoeSortMultiCorePerformance::Process()
{
    VMSProcess();
    SortOutProcess();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_VBS_ONE_CORE_PERFORMANCE_H
|
||||
@@ -0,0 +1,167 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_one_core.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_SORT_ONE_CORE_H
|
||||
#define MOE_CUSTOM_SORT_ONE_CORE_H
|
||||
|
||||
#include "moe_custom_sort_base.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Single-core sorter for small n*k: loads all expert ids at once, sorts them
// (with their source-row indices) entirely in UB on core 0, and writes the
// sorted ids plus row indices back to GM. Other cores only hit the barrier.
class MoeSortOneCore : public MoeSortBase {
public:
    __aicore__ inline MoeSortOneCore(){};
    // Binds GM buffers / tiling and allocates UB; call before Process().
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Pipeline stages run on core 0 only, in this order.
    __aicore__ inline void CopyIn();
    __aicore__ inline void SortCompute();
    __aicore__ inline void ExpertCountCompute();
    __aicore__ inline void CopyOut();

private:
    // tileLength rounded up to a whole Sort32 repeat (32 elements).
    int64_t sortNum;
};
|
||||
|
||||
// Loads all expert ids into the first half of a UB tensor and fills the
// second half with the identity row indices 0..sortNum-1, which travel with
// the keys through the sort.
__aicore__ inline void MoeSortOneCore::CopyIn()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.AllocTensor<int32_t>();
    // One burst of totalLength int32 elements, no padding inserted.
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(this->totalLength * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm, dataCopyParams, dataCopyPadParams);
    // Second half of inLocal: arithmetic progression 0,1,2,... = source rows.
    LocalTensor<int32_t> rowIdxLocal = inLocal[this->sortNum];
    ArithProgression<int32_t>(rowIdxLocal, 0, 1, this->sortNum);
    sortDataCopyInQueue.EnQue(inLocal);
}
|
||||
|
||||
// Sorts the loaded expert ids ascending while carrying their source-row
// indices. The hardware Sort32 sorts descending, so ids are negated first,
// sorted, then negated back. Tail slots and (in ep mode) out-of-range ids are
// set to MIN_FP32 so they land at the end of the sorted order.
__aicore__ inline void MoeSortOneCore::SortCompute()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdx = inLocal[0];
    // Reinterpret in place: ids become floats so the float sorter can run.
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, this->tileLength);
    // Negate: descending sort of -id == ascending sort of id.
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, this->tileLength);

    if (ep_) {
        // Expert-parallel mode: ids below expertStart_ (i.e. -id > -start)
        // are masked to MIN_FP32 so they sort to the very end.
        LocalTensor<uint8_t> maskLocalTensor = sortedBuffer.Get<uint8_t>();
        AscendC::CompareScalar(maskLocalTensor, expertIdxFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::GT,
                               (this->totalLength + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM *
                                   ONE_REPEAT_COMPARE_NUM);
        LocalTensor<float> floatMinLocalTensor = tempBuffer.Get<float>();
        Duplicate(floatMinLocalTensor, MIN_FP32, this->tileLength);
        Select(expertIdxFp32, maskLocalTensor, floatMinLocalTensor, expertIdxFp32, SELMODE::VSEL_TENSOR_TENSOR_MODE,
               this->totalLength);
    }

    // Pad the last partial Sort32 repeat with MIN_FP32 via a bit mask so the
    // padding cannot displace real elements.
    int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = this->totalLength - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;                   // keep only tail lanes
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); // limit to one repeat
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    // Pack keys into the interleaved (score,index) layout Sort expects.
    LocalTensor<float> concatLocal;
    LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    Concat(concatLocal, expertIdxFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    LocalTensor<uint32_t> sourceRowLocal;
    sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
    // Full sort carrying sourceRowLocal as the payload index.
    Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    // Unpack into: first half = sorted (negated) ids, second half = row idx.
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
    LocalTensor<uint32_t> expandDstToSrcRowLocal;
    expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
    Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
    // Undo the earlier negation to restore real expert ids.
    Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);

    // Reinterpret back to int32 in place for the GM copy-out.
    LocalTensor<int32_t> expertForSourceRowLocalInt32;
    expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
    Cast(expertForSourceRowLocalInt32, sortedExpertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
    sortDataCopyOutQueue.EnQue<float>(outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
}
|
||||
|
||||
// Writes the two halves of the sorted result back to GM: sorted expert ids
// to the workspace region and the matching row indices to expendedRowIdxGm.
__aicore__ inline void MoeSortOneCore::CopyOut()
{
    LocalTensor<int32_t> outLocal = sortDataCopyOutQueue.DeQue<int32_t>();
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    // Only the real totalLength elements are copied; sortNum padding stays in UB.
    intriParams.blockLen = this->totalLength * sizeof(int32_t);
    DataCopyPad(sortedexpertIdxGm, outLocal[0], intriParams);
    DataCopyPad(expendedRowIdxGm, outLocal[this->sortNum], intriParams);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// One-time setup for the single-core sorter.
//   expertIdx      : [n*k] int32 expert id per (token, top-k) pair (input).
//   expendedRowIdx : output row-index buffer, used directly in SCATTER mode.
//   workspace      : user GM scratch (sorted ids at offset 0, staged row
//                    indices after them, per-expert counters at 2*Align(n*k)).
//   tilingData     : host-computed parameters; tPipe: UB buffer manager.
// Fix vs. original: removed the dead local `int64_t coreNum = GetBlockNum();`
// — it was never read (this->coreNum is already set from tiling data).
__aicore__ inline void MoeSortOneCore::Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                            const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->pipe = tPipe;
    // In the one-core path the "last core" slice covers the whole input.
    this->tileLength = Align(tilingData->vbsComputeParamsOp.lastCorePerLoopElements, sizeof(int32_t));
    // Round up to whole Sort32 repeats (32 elements each).
    this->sortNum = Ceil(this->tileLength, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    this->totalLength = tilingData->n * tilingData->k;
    this->coreNum = tilingData->coreNum;
    this->ep_ = tilingData->ep;
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;

    expertIdxGm.SetGlobalBuffer((__gm__ int32_t *)expertIdx, this->tileLength);
    sortedexpertIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(workspace),
                                      Align(this->totalLength, sizeof(int32_t)));
    if (rowIdxType_ == SCATTER) {
        // SCATTER: row indices go straight to the caller's output buffer.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx, this->tileLength);
    } else {
        // GATHER: stage row indices in workspace for the later gather pass.
        // NOTE(review): the offset uses tileLength where the multi-core class
        // uses totalLength — equivalent only if tileLength == Align(n*k);
        // confirm against the tiling that selects this kernel.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->tileLength, sizeof(int32_t)),
                                         Align(this->tileLength, sizeof(int32_t)));
    }

    if (GetBlockIdx() == 0) {
        // Core 0 zeroes the per-expert token counters once, then raises an
        // MTE3->MTE2 flag so its own later reads see the cleared memory.
        expertCountTempGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                              Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2,
                                          tilingData->actualExpertNum);
        InitGlobalMemory(expertCountTempGm, tilingData->actualExpertNum, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }

    // key and value
    int64_t kvFactor = 2;
    // Each UB buffer holds sortNum (key,value) int32 pairs.
    int64_t buffSize = this->sortNum * sizeof(int32_t) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, buffSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, buffSize);
    pipe->InitBuffer(tempBuffer, buffSize);
    pipe->InitBuffer(sortedBuffer, buffSize);
}
|
||||
|
||||
// Entire sort runs on core 0: load, sort, store. All cores then meet at the
// barrier so later stages on any core see the sorted data in GM.
__aicore__ inline void MoeSortOneCore::Process()
{
    if (GetBlockIdx() < 1) {
        CopyIn();
        SortCompute();
        CopyOut();
    }
    this->SyncAll();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_SORT_ONE_CORE_H
|
||||
@@ -0,0 +1,412 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_init_routing_custom.cpp
|
||||
* \brief
|
||||
*/
|
||||
#include "moe_custom_mrgsort_out.h"
|
||||
#include "moe_custom_mrgsort.h"
|
||||
#include "moe_custom_sort_one_core.h"
|
||||
#include "moe_custom_sort_multi_core.h"
|
||||
#include "moe_custom_gather_sort_multi_core.h"
|
||||
#include "moe_custom_expert_tokens_count.h"
|
||||
#include "moe_custom_row_idx_gather.h"
|
||||
#include "moe_custom_gather_out.h"
|
||||
#include "moe_custom_gather_dynamic_quant.h"
|
||||
#include "moe_custom_gather_static_quant.h"
|
||||
#include "moe_custom_full_load.h"
|
||||
#include "moe_custom_full_load_dynamic_quant.h"
|
||||
#include "moe_custom_full_load_static_quant.h"
|
||||
#include "moe_custom_full_load_unquantized.h"
|
||||
#include "moe_custom_sort_actual_expert.h"
|
||||
#include "moe_custom_sort_multi_core_performance.h"
|
||||
#include "moe_custom_row_idx_gather_droppad_dynamic.h"
|
||||
#include "moe_custom_row_idx_gather_droppad.h"
|
||||
#include "moe_custom_gather_out_droppad.h"
|
||||
#include "moe_custom_gather_droppad_static_quant.h"
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_PERFORMANCE 2000000
|
||||
#define UNQUANTIZED_FULLLOAD 2100000
|
||||
#define STATIC_QUANT_FULLLOAD 2200000
|
||||
#define DYNAMIC_QUANT_GATHER_NO_SCALE_FULLLOAD 2300000
|
||||
#define DYNAMIC_QUANT_GATHER_1H_DIM_SCALE_FULLLOAD 2301000
|
||||
#define DYNAMIC_QUANT_GATHER_EH_SCALE_FULLLOAD 2302000
|
||||
#define DYNAMIC_QUANT_SCATTER_NO_SCALE_FULLLOAD 2310000
|
||||
#define DYNAMIC_QUANT_SCATTER_1H_SCALE_FULLLOAD 2311000
|
||||
#define DYNAMIC_QUANT_SCATTER_EH_SCALE_FULLLOAD 2312000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_NODROP 1000000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_SCATTER_NODROP 1001000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_NODROP 1100000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_SCATTER_NODROP 1101000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_NODROP 1020000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_SCATTER_NODROP 1021000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_NODROP 1120000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_SCATTER_NODROP 1121000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_NODROP 1010000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_SCATTER_NODROP 1011000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_NODROP 1110000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_SCATTER_NODROP 1111000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP 1000100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP 1100100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_DROP 1020100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_DROP 1120100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP 1010100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP 1110100
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER 1200000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER 1201000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER 1300000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER 1301000
|
||||
|
||||
|
||||
using namespace AscendC;
|
||||
using namespace MoeInitRoutingCustom;
|
||||
extern "C" __global__ __aicore__ void moe_init_routing_custom(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset,
|
||||
GM_ADDR expandedX, GM_ADDR expandedRowIdx,
|
||||
GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale,
|
||||
GM_ADDR workspace, GM_ADDR tiling)
|
||||
{
|
||||
KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0);
|
||||
if (g_coreType == AIC) {
|
||||
return;
|
||||
}
|
||||
|
||||
GET_TILING_DATA(tilingData, tiling);
|
||||
if (workspace == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
GM_ADDR userWS = GetUserWorkspace(workspace);
|
||||
if (userWS == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto t = &tilingData;
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_PERFORMANCE)) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoad op;
|
||||
op.Init(x, expertIdx, scale, offset, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_GATHER_NO_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, GATHER, NO_SCALE> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_GATHER_1H_DIM_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, GATHER, SCALE_1H> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_GATHER_EH_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, GATHER, SCALE_EH> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_SCATTER_NO_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, SCATTER, NO_SCALE> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_SCATTER_1H_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, SCATTER, SCALE_1H> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_SCATTER_EH_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, SCATTER, SCALE_EH> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(UNQUANTIZED_FULLLOAD)) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadUnquantized<DTYPE_X> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(STATIC_QUANT_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadStaticQuant<DTYPE_X> op;
|
||||
op.Init(x, expertIdx, scale, offset, expandedX, expandedRowIdx, expertTokensCountOrCumsum, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER)) {
|
||||
TPipe sortActualExpertPipe;
|
||||
MoeSortActualExpert<DTYPE_X> op;
|
||||
bool isFinished = false;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&sortActualExpertPipe);
|
||||
isFinished = op.Process();
|
||||
sortActualExpertPipe.Destroy();
|
||||
if (isFinished) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe gatherSortMultiCorePipe;
|
||||
MoeGatherSortMultiCore op;
|
||||
op.Init(expertIdx, expandedRowIdx, userWS, t, &gatherSortMultiCorePipe);
|
||||
op.Process();
|
||||
gatherSortMultiCorePipe.Destroy();
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe mergeSortMultiCorePipe;
|
||||
MoeSortMultiCorePerformance op;
|
||||
op.Init(expandedRowIdx, userWS, t, &mergeSortMultiCorePipe);
|
||||
op.Process();
|
||||
mergeSortMultiCorePipe.Destroy();
|
||||
}
|
||||
|
||||
TPipe sortPipe;
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP)) {
|
||||
MoeSortOneCore op;
|
||||
op.Init(expertIdx, expandedRowIdx, userWS, t, &sortPipe);
|
||||
op.Process();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP)) {
|
||||
MoeSortMultiCore op;
|
||||
op.Init(expertIdx, expandedRowIdx, userWS, t, &sortPipe);
|
||||
op.Process();
|
||||
}
|
||||
sortPipe.Destroy();
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe histogramPipe;
|
||||
if (t->expertTokensNumType == CUMSUM_MODE) {
|
||||
ExpertTokensCount<CUMSUM_MODE> countOp;
|
||||
countOp.Init<true>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else if (t->expertTokensNumType == COUNT_MODE) {
|
||||
ExpertTokensCount<COUNT_MODE> countOp;
|
||||
countOp.Init<true>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else {
|
||||
ExpertTokensCount<KEY_VALUE_MODE> countOp;
|
||||
countOp.Init<true>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
}
|
||||
|
||||
} else {
|
||||
if (t->dropPadMode == 1 || t->ep == 1 || t->expertTokensNumFlag != EXERPT_TOKENS_NONE) {
|
||||
TPipe histogramPipe;
|
||||
if (t->expertTokensNumType == CUMSUM_MODE) {
|
||||
ExpertTokensCount<CUMSUM_MODE> countOp;
|
||||
countOp.Init<false>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else if (t->expertTokensNumType == COUNT_MODE) {
|
||||
ExpertTokensCount<COUNT_MODE> countOp;
|
||||
countOp.Init<false>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else {
|
||||
ExpertTokensCount<KEY_VALUE_MODE> countOp;
|
||||
countOp.Init<false>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP)) {
|
||||
TPipe rowIdxGatherDropPadPipe;
|
||||
MoeCustomSrcToDstWithCapacity<DTYPE_X, MoeInitRoutingCustomTilingData> rowIdxGatherDropPadOp;
|
||||
rowIdxGatherDropPadOp.Init(expandedRowIdx, expandedX, expandedScale, userWS, t, &rowIdxGatherDropPadPipe);
|
||||
rowIdxGatherDropPadOp.Process();
|
||||
rowIdxGatherDropPadPipe.Destroy();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP)) {
|
||||
TPipe rowIdxGatherDropPadPipe;
|
||||
MoeCustomSrcToDstWithCapacity<int8_t, MoeInitRoutingCustomTilingData> rowIdxGatherDropPadOp;
|
||||
rowIdxGatherDropPadOp.Init(expandedRowIdx, expandedX, expandedScale, userWS, t, &rowIdxGatherDropPadPipe);
|
||||
rowIdxGatherDropPadOp.Process();
|
||||
rowIdxGatherDropPadPipe.Destroy();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_DROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
MoeCustomSrcToDstAndGather<DTYPE_X, MoeInitRoutingCustomTilingData> gatherDroppadDynamicQuantOp;
|
||||
gatherDroppadDynamicQuantOp.Init(x, scale, expandedRowIdx, expandedX, expandedScale, userWS, t,
|
||||
&gatherPipe);
|
||||
gatherDroppadDynamicQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
} else {
|
||||
TPipe rowIdxPipe;
|
||||
RowIdxGather rowIdxGatherOp;
|
||||
rowIdxGatherOp.Init(expandedRowIdx, userWS, t, &rowIdxPipe);
|
||||
rowIdxGatherOp.Process();
|
||||
rowIdxPipe.Destroy();
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe gatherPipe;
|
||||
if (t->ep == 1) {
|
||||
MoeGatherOut<DTYPE_X, 1> gatherOp;
|
||||
gatherOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else {
|
||||
MoeGatherOut<DTYPE_X, 0> gatherOp;
|
||||
gatherOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_NODROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
if (t->ep == 0 and t->smoothType != SCALE_EH) {
|
||||
MoeGatherOutDynamicQuant<DTYPE_X, GATHER> gatherDynamicQuantOp;
|
||||
gatherDynamicQuantOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherDynamicQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else {
|
||||
MoeGatherOutDynamicQuant<DTYPE_X, SCATTER> gatherDynamicQuantOp;
|
||||
gatherDynamicQuantOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherDynamicQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
}
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_NODROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
if (t->ep == 1) {
|
||||
MoeGatherOutQuant<DTYPE_X, 1> gatherStaticQuantOp;
|
||||
gatherStaticQuantOp.Init(x, scale, offset, expandedRowIdx, expandedX, userWS, t, &gatherPipe);
|
||||
gatherStaticQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else {
|
||||
MoeGatherOutQuant<DTYPE_X, 0> gatherStaticQuantOp;
|
||||
gatherStaticQuantOp.Init(x, scale, offset, expandedRowIdx, expandedX, userWS, t, &gatherPipe);
|
||||
gatherStaticQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
}
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP)) {
|
||||
TPipe gatherPipe;
|
||||
MoeGatherOutDroppad<DTYPE_X> gatherDroppadOp;
|
||||
gatherDroppadOp.Init(x, scale, expandedRowIdx, expandedX, expandedScale, userWS, t, &gatherPipe);
|
||||
gatherDroppadOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
MoeGatherDroppadQuant<DTYPE_X> gatherDroppadStaticQuantOp;
|
||||
gatherDroppadStaticQuantOp.Init(x, scale, offset, expandedRowIdx, expandedX, userWS, t, &gatherPipe);
|
||||
gatherDroppadStaticQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1118,6 +1118,106 @@ at::Tensor combine_prefill(const at::Tensor& x, const at::Tensor& topk_idx, cons
|
||||
return combined_x;
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_moe_init_routing_custom(
|
||||
const at::Tensor &x, const at::Tensor &expert_idx,
|
||||
const c10::optional<at::Tensor> &scale, const c10::optional<at::Tensor> &offset, int64_t active_num,
|
||||
int64_t expert_capacity, int64_t expert_num, int64_t drop_pad_mode, int64_t expert_tokens_num_type,
|
||||
bool expert_tokens_num_flag, int64_t quant_mode, at::IntArrayRef active_expert_range, int64_t row_idx_type)
|
||||
{
|
||||
constexpr int64_t DIM_X = 2;
|
||||
constexpr int64_t DIM_EXPERT_IDX = 2;
|
||||
constexpr int64_t LENGTH_ACTIVE_EXPERT_RANGE = 2;
|
||||
constexpr int64_t EXPERT_TOKENS_COUNT = 1;
|
||||
constexpr int64_t EXPERT_TOKENS_KEY_VALUE = 2;
|
||||
constexpr int64_t QUANT_MODE_UNQUANT = -1;
|
||||
constexpr int64_t QUANT_MODE_DYNAMIC_QUANT = 1;
|
||||
constexpr int64_t CUMSUM = 0;
|
||||
constexpr int64_t COUNT = 1;
|
||||
constexpr int64_t KEY_VALUE = 2;
|
||||
|
||||
if (active_expert_range.empty()) {
|
||||
active_expert_range = at::IntArrayRef({0, expert_num});
|
||||
}
|
||||
|
||||
int64_t x_dim = x.dim();
|
||||
TORCH_CHECK(x_dim == DIM_X, "The x should be ", DIM_X,
|
||||
"-Dimension, current is ", x_dim, "-Dimension.");
|
||||
|
||||
int64_t expert_idx_dim = expert_idx.dim();
|
||||
TORCH_CHECK(expert_idx_dim == DIM_EXPERT_IDX, "The expert_idx should be ", DIM_EXPERT_IDX,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int64_t active_expert_range_length = active_expert_range.size();
|
||||
TORCH_CHECK(active_expert_range_length == LENGTH_ACTIVE_EXPERT_RANGE, "The active_expert_range should be ", LENGTH_ACTIVE_EXPERT_RANGE,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int expert_length = active_expert_range[1] - active_expert_range[0];
|
||||
auto x_size = x.sizes();
|
||||
auto expert_idx_size = expert_idx.sizes();
|
||||
|
||||
int bs = x_size[0];
|
||||
int h = x_size[1];
|
||||
int k = expert_idx_size[1];
|
||||
int64_t expanded_scale_len = 0;
|
||||
at::Tensor expanded_x;
|
||||
|
||||
if (drop_pad_mode == 1) { // Drop/Pad
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = expert_num * expert_capacity;
|
||||
} else { // Dropless / Active
|
||||
if (active_num > 0) { // Active
|
||||
int64_t num_out_tokens = std::min((int64_t)bs * k, active_num);
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = num_out_tokens;
|
||||
} else { // Dropless
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({bs * k, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({bs * k, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = bs * k;
|
||||
}
|
||||
}
|
||||
|
||||
at::Tensor expanded_row_idx = at::empty({bs * k}, expert_idx.options());
|
||||
at::Tensor expert_tokens_count_or_cumsum;
|
||||
if (expert_tokens_num_type >= CUMSUM && expert_tokens_num_type <= COUNT) {
|
||||
// expert_tokens_count_or_cumsum in [end-start, ]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_length}, x.options().dtype(at::kLong));
|
||||
} else if (expert_tokens_num_type == KEY_VALUE) {
|
||||
// key_value in [2, end-start]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_num, 2}, x.options().dtype(at::kLong));
|
||||
}
|
||||
at::Tensor expanded_scale = at::empty({expanded_scale_len}, x.options().dtype(at::kFloat));
|
||||
EXEC_NPU_CMD(aclnnMoeInitRoutingCustom,
|
||||
x,
|
||||
expert_idx,
|
||||
scale,
|
||||
offset,
|
||||
active_num,
|
||||
expert_capacity,
|
||||
expert_num,
|
||||
drop_pad_mode,
|
||||
expert_tokens_num_type,
|
||||
expert_tokens_num_flag,
|
||||
quant_mode,
|
||||
active_expert_range,
|
||||
row_idx_type,
|
||||
expanded_x,
|
||||
expanded_row_idx,
|
||||
expert_tokens_count_or_cumsum,
|
||||
expanded_scale);
|
||||
return std::tie(expanded_x, expanded_row_idx, expert_tokens_count_or_cumsum, expanded_scale);
|
||||
}
|
||||
|
||||
} // namespace vllm_ascend
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
@@ -1257,4 +1357,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
"num_ranks) -> Tensor");
|
||||
ops.impl("combine_prefill", torch::kPrivateUse1,
|
||||
&vllm_ascend::combine_prefill);
|
||||
ops.def(
|
||||
"npu_moe_init_routing_custom(Tensor x, Tensor expert_idx, *, Tensor? scale=None, Tensor? offset=None, int active_num=-1, "
|
||||
" int expert_capacity=-1, int expert_num=-1, int drop_pad_mode=0, int expert_tokens_num_type=0, "
|
||||
" bool expert_tokens_num_flag=False, int quant_mode=0, int[2] active_expert_range=[], "
|
||||
" int row_idx_type=0) -> (Tensor, Tensor, Tensor, Tensor)"
|
||||
);
|
||||
ops.impl("npu_moe_init_routing_custom", torch::kPrivateUse1, &vllm_ascend::npu_moe_init_routing_custom);
|
||||
}
|
||||
|
||||
@@ -283,6 +283,89 @@ std::tuple<at::Tensor, at::Tensor> matmul_allreduce_add_rmsnorm_meta(
|
||||
return {output, add_out};
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_moe_init_routing_custom_meta(
|
||||
const at::Tensor &x, const at::Tensor &expert_idx,
|
||||
const c10::optional<at::Tensor> &scale, const c10::optional<at::Tensor> &offset, int64_t active_num,
|
||||
int64_t expert_capacity, int64_t expert_num, int64_t drop_pad_mode, int64_t expert_tokens_num_type,
|
||||
bool expert_tokens_num_flag, int64_t quant_mode, at::IntArrayRef active_expert_range, int64_t row_idx_type)
|
||||
{
|
||||
constexpr int64_t DIM_X = 2;
|
||||
constexpr int64_t DIM_EXPERT_IDX = 2;
|
||||
constexpr int64_t LENGTH_ACTIVE_EXPERT_RANGE = 2;
|
||||
constexpr int64_t EXPERT_TOKENS_COUNT = 1;
|
||||
constexpr int64_t EXPERT_TOKENS_KEY_VALUE = 2;
|
||||
constexpr int64_t QUANT_MODE_UNQUANT = -1;
|
||||
constexpr int64_t QUANT_MODE_DYNAMIC_QUANT = 1;
|
||||
constexpr int64_t CUMSUM = 0;
|
||||
constexpr int64_t COUNT = 1;
|
||||
constexpr int64_t KEY_VALUE = 2;
|
||||
|
||||
if (active_expert_range.empty()) {
|
||||
active_expert_range = at::IntArrayRef({0, expert_num});
|
||||
}
|
||||
|
||||
int64_t x_dim = x.dim();
|
||||
TORCH_CHECK(x_dim == DIM_X, "The x should be ", DIM_X,
|
||||
"-Dimension, current is ", x_dim, "-Dimension.");
|
||||
|
||||
int64_t expert_idx_dim = expert_idx.dim();
|
||||
TORCH_CHECK(expert_idx_dim == DIM_EXPERT_IDX, "The expert_idx should be ", DIM_EXPERT_IDX,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int64_t active_expert_range_length = active_expert_range.size();
|
||||
TORCH_CHECK(active_expert_range_length == LENGTH_ACTIVE_EXPERT_RANGE, "The active_expert_range should be ", LENGTH_ACTIVE_EXPERT_RANGE,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int expert_length = active_expert_range[1] - active_expert_range[0];
|
||||
auto x_size = x.sizes();
|
||||
auto expert_idx_size = expert_idx.sizes();
|
||||
|
||||
int bs = x_size[0];
|
||||
int h = x_size[1];
|
||||
int k = expert_idx_size[1];
|
||||
int64_t expanded_scale_len = 0;
|
||||
at::Tensor expanded_x;
|
||||
|
||||
if (drop_pad_mode == 1) { // Drop/Pad
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = expert_num * expert_capacity;
|
||||
} else { // Dropless / Active
|
||||
if (active_num > 0) { // Active
|
||||
int64_t num_out_tokens = std::min((int64_t)bs * k, active_num);
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = num_out_tokens;
|
||||
} else { // Dropless
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({bs * k, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({bs * k, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = bs * k;
|
||||
}
|
||||
}
|
||||
|
||||
at::Tensor expanded_row_idx = at::empty({bs * k}, expert_idx.options());
|
||||
at::Tensor expert_tokens_count_or_cumsum;
|
||||
if (expert_tokens_num_type >= CUMSUM && expert_tokens_num_type <= COUNT) {
|
||||
// expert_tokens_count_or_cumsum in [end-start, ]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_length}, x.options().dtype(at::kLong));
|
||||
} else if (expert_tokens_num_type == KEY_VALUE) {
|
||||
// key_value in [2, end-start]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_num, 2}, x.options().dtype(at::kLong));
|
||||
}
|
||||
|
||||
at::Tensor expanded_scale = at::empty({expanded_scale_len}, x.options().dtype(at::kFloat));
|
||||
return {expanded_x, expanded_row_idx, expert_tokens_count_or_cumsum, expanded_scale};
|
||||
}
|
||||
|
||||
} // namespace meta
|
||||
} // namespace vllm_ascend
|
||||
|
||||
@@ -316,5 +399,7 @@ TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
|
||||
ops.impl("dispatch_ffn_combine", &vllm_ascend::meta::dispatch_ffn_combine_meta);
|
||||
// matmul allreduce add rmsnorm
|
||||
ops.impl("matmul_allreduce_add_rmsnorm", &vllm_ascend::meta::matmul_allreduce_add_rmsnorm_meta);
|
||||
// moe_init_routing_custom
|
||||
ops.impl("npu_moe_init_routing_custom", &vllm_ascend::meta::npu_moe_init_routing_custom_meta);
|
||||
}
|
||||
}
|
||||
|
||||
349
tests/e2e/nightly/ops/test_moe_init_routing_custom.py
Normal file
349
tests/e2e/nightly/ops/test_moe_init_routing_custom.py
Normal file
@@ -0,0 +1,349 @@
|
||||
import itertools
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm_ascend.utils import enable_custom_op
|
||||
|
||||
enable_custom_op()
|
||||
|
||||
|
||||
def adapter_capacity(sorted_row_idx, sorted_expert_idx, capacity):
    """Enforce per-expert capacity on already-sorted routing arrays, in place.

    Walks the expert ids (assumed grouped after a stable sort) and, once a run
    of identical ids exceeds `capacity`, marks the overflow entries in both
    `sorted_expert_idx` and `sorted_row_idx` with -1 so they are dropped.
    """
    run_len = 0
    current = sorted_expert_idx[0]
    for pos in range(len(sorted_expert_idx)):
        value = sorted_expert_idx[pos]
        if value == current:
            run_len += 1
        else:
            # A new expert run starts here; reset its slot counter.
            current = value
            run_len = 1
        if run_len > capacity:
            sorted_expert_idx[pos] = -1
            sorted_row_idx[pos] = -1
|
||||
|
||||
|
||||
def moe_init_routing_golden(x, expert_idx, scale, offset, active_num,
                            expert_capacity, expert_num, drop_pad_mode,
                            expert_tokens_num_type, expert_tokens_num_flag,
                            active_expert_range, quant_mode, row_idx_type):
    """Host-side (numpy) golden reference for aclnnMoeInitRoutingCustom.

    Returns a tuple (expanded_x, expanded_row_idx, expert_tokens_count,
    expanded_scale) computed on CPU for comparison against the NPU kernel.
    Individual elements may be None or placeholders depending on quant_mode
    and expert_tokens_num_flag.
    """
    if drop_pad_mode == 1:
        # Drop/Pad output is shaped (expert_num, capacity, h); needs expert_num > 0.
        if expert_num <= 0:
            print("expert num can not be 0")
            return
    # Drop/Pad mode treats every expert in [0, expert_num) as active.
    expert_start = active_expert_range[0] if drop_pad_mode == 0 else 0
    expert_end = active_expert_range[1] if drop_pad_mode == 0 else expert_num
    num_rows = x.shape[0]
    h = x.shape[1]
    k = expert_idx.shape[-1]
    expert_idx_in = expert_idx.copy().reshape(-1)
    # Token-expert pairs routed to an expert inside the active range.
    actual_expert_total_num: int = np.sum((expert_idx_in >= expert_start)
                                          & (expert_idx_in < expert_end))

    # Ids below expert_start are pushed to INT32_MAX so the stable sort places
    # them last; ids >= expert_end already sort after every in-range id, so the
    # first actual_expert_total_num sorted entries are exactly the valid pairs.
    expert_idx_in[(expert_idx_in
                   < expert_start)] = np.int32(np.iinfo(np.int32).max)
    sorted_expert_indices = np.argsort(expert_idx_in, axis=-1, kind="stable")
    sorted_expert_idx = expert_idx_in[sorted_expert_indices]
    if row_idx_type == 1:
        # Gather layout: output row i is sourced from pair expanded_row_idx[i].
        expanded_row_idx = sorted_expert_indices[:actual_expert_total_num]
    else:
        # Scatter layout: token-expert pair j writes output row
        # expanded_row_idx[j]; -1 marks dropped (out-of-range) pairs.
        expanded_row_idx = np.ones(num_rows * k).astype(np.int32) * -1
        tmp_indices = np.arange(actual_expert_total_num)
        expanded_row_idx[
            sorted_expert_indices[:actual_expert_total_num]] = tmp_indices

    if not expert_tokens_num_flag:
        # Placeholder; callers skip this output when the flag is off.
        expert_tokens_count = torch.tensor([0])
    else:
        if drop_pad_mode == 0:
            if expert_tokens_num_type == 1:
                # COUNT: tokens per expert over [expert_start, expert_end),
                # zero-padded for trailing experts that received no tokens.
                expert_tokens_count = np.bincount(
                    sorted_expert_idx[:actual_expert_total_num] - expert_start)
                expert_tokens_count = np.concatenate([
                    expert_tokens_count,
                    np.zeros((expert_end - expert_start) -
                             len(expert_tokens_count)).astype(np.int64)
                ])
            elif expert_tokens_num_type == 0:
                # CUMSUM: running total of the per-expert counts.
                expert_tokens_count = np.bincount(
                    sorted_expert_idx[:actual_expert_total_num] - expert_start)
                expert_tokens_count = np.concatenate([
                    expert_tokens_count,
                    np.zeros((expert_end - expert_start) -
                             len(expert_tokens_count)).astype(np.int64)
                ])
                expert_tokens_count = np.cumsum(expert_tokens_count)
            elif expert_tokens_num_type == 2:
                # KEY_VALUE: (expert id, count) pairs for experts that actually
                # received tokens, padded with a single [0, 0] row if short.
                expert_id, counts = np.unique(
                    sorted_expert_idx[:actual_expert_total_num],
                    return_counts=True)
                expert_tokens_count = np.column_stack((expert_id, counts))
                if expert_tokens_count.shape[0] < expert_num:
                    expert_tokens_count = np.concatenate(
                        (expert_tokens_count, [
                            [0, 0],
                        ]), axis=0)
        else:
            # Drop/Pad: plain per-expert counts, zero-padded.
            expert_tokens_count = np.bincount(
                sorted_expert_idx[:actual_expert_total_num] - expert_start)
            zeros_array = np.zeros(
                (expert_end - expert_start) - len(expert_tokens_count),
                dtype=np.int64)
            expert_tokens_count = np.concatenate(
                [expert_tokens_count, zeros_array])
        expert_tokens_count = expert_tokens_count.astype(np.int64)

    if drop_pad_mode == 0:
        # Dropless/Active: keep at most active_num routed pairs (0 = keep all).
        if active_num == 0:
            active_num = actual_expert_total_num
        else:
            active_num = min(active_num, actual_expert_total_num)
        expanded_scale = None
        # // k maps a flattened token-expert pair back to its source token row.
        expanded_x = x[sorted_expert_indices[:active_num] // k, :]
        if scale is not None and quant_mode == -1:
            expanded_scale = scale[sorted_expert_indices[:active_num] // k]
    else:
        # Drop/Pad: cap each expert at expert_capacity tokens (marks -1 in place).
        adapter_capacity(sorted_expert_indices, sorted_expert_idx,
                         expert_capacity)

        # sort_row_tmp[e * capacity + slot] = source pair index, -1 = padding.
        sort_row_tmp = np.full((expert_num * expert_capacity), -1, dtype=int)
        offset_tmp = 0
        lastExpertId = 0
        for i, val in enumerate(sorted_expert_indices):
            if val != -1:
                if lastExpertId != sorted_expert_idx[i]:
                    # New expert run begins: restart its slot counter.
                    offset_tmp = 0
                    lastExpertId = sorted_expert_idx[i]
                sort_row_tmp[sorted_expert_idx[i] * expert_capacity +
                             offset_tmp] = sorted_expert_indices[i]
                offset_tmp = offset_tmp + 1

        # Invert the mapping: source pair index -> output row (-1 = dropped).
        expanded_row_idx = np.full(sorted_expert_indices.shape, -1)
        for i, val in enumerate(sort_row_tmp):
            if val != -1:
                expanded_row_idx[val] = i

        # Mask is 1 for padded rows so they can be zeroed at the end.
        expanded_x_mask = np.full((expert_num * expert_capacity, h),
                                  1,
                                  dtype=int)
        expanded_x = np.full((expert_num * expert_capacity, h),
                             0,
                             dtype=x.dtype)
        for i, val in enumerate(sort_row_tmp):
            if val != -1:
                expanded_x[i] = x[val // k]
                expanded_x_mask[i] = np.full((h, ), 0, dtype=int)

    if quant_mode == -1:
        # Unquantized: data passes through; gather per-row scale if provided.
        expanded_x = expanded_x
        expanded_row_idx = expanded_row_idx
        if scale is not None and drop_pad_mode == 1:
            expanded_scale = np.full((expert_num * expert_capacity, ),
                                     0,
                                     dtype=scale.dtype)
            for i, val in enumerate(sort_row_tmp):
                if val != -1:
                    expanded_scale[i] = scale[val // k]
        if scale is None:
            expanded_scale = None

    if quant_mode == 0:
        # Static quant: int8 = clip(round(x * scale + offset), -128, 127),
        # computed in fp16 to mirror the device arithmetic.
        expanded_scale = None
        expanded_x_fp16 = expanded_x.astype(np.float16)
        if scale is not None:
            scale_val = scale.astype(np.float16)
        else:
            raise ValueError("scale cannot be None when quant_mode is 0")
        if offset is not None:
            offset_val = offset.astype(np.float16)
        else:
            raise ValueError("offset cannot be None when quant_mode is 0")
        scale_rst = expanded_x_fp16 * scale_val[0]
        add_offset = scale_rst + offset_val[0]
        round_data = np.rint(add_offset)
        round_data = np.clip(round_data, -128, 127)
        expanded_x = round_data.astype(np.int8)

    if quant_mode == 1:
        # Dynamic quant: optional smooth scale, then per-row abs-max to int8.
        x_final = expanded_x.astype(np.float32)
        if scale is None:
            x_abs = np.abs(x_final)
            x_max = np.max(x_abs, axis=-1, keepdims=True)
            expanded_scale = x_max / 127
            expanded_x = x_final / expanded_scale
            expanded_x = np.round(expanded_x).astype(np.int8)
        else:
            if scale.shape[0] == 1:
                # Single shared smooth-scale row.
                x_final = x_final * scale
            else:
                if drop_pad_mode == 0:
                    # Per-expert smooth scale, indexed by the row's expert id.
                    x_final = x_final * scale[sorted_expert_idx[:active_num] -
                                              expert_start]

                else:
                    # Drop/Pad rows are grouped by expert: i // capacity = expert.
                    for i, val in enumerate(sort_row_tmp):
                        if val != -1:
                            x_final[i] = x_final[i] * scale[i //
                                                            expert_capacity]
            x_abs = np.abs(x_final)
            x_max = np.max(x_abs, axis=-1, keepdims=True)
            expanded_scale = x_max / 127
            expanded_x = x_final / expanded_scale
            expanded_x = np.round(expanded_x).astype(np.int8)
        if x.dtype == np.int8:
            # Already-int8 input: no dynamic scale is produced.
            expanded_scale = None
    if drop_pad_mode == 1:
        # Zero out padded rows and reshape to (expert_num, capacity, h).
        expanded_x = np.ma.array(expanded_x, mask=expanded_x_mask).filled(0)
        expanded_x = expanded_x.reshape(expert_num, expert_capacity, h)

    return expanded_x, expanded_row_idx, expert_tokens_count, expanded_scale
|
||||
|
||||
|
||||
def npu_pta(x, expert_idx, scale, offset, active_num, expert_capacity,
            expert_num, drop_pad_mode, expert_tokens_num_type,
            expert_tokens_num_flag, quant_mode, active_expert_range,
            row_idx_type):
    """Invoke the registered custom NPU op and return its four outputs:
    (expanded_x, expanded_row_idx, expert_token_cumsum_or_count,
    expanded_scale)."""
    outputs = torch.ops._C_ascend.npu_moe_init_routing_custom(
        x,
        expert_idx,
        scale=scale,
        offset=offset,
        active_num=active_num,
        expert_capacity=expert_capacity,
        expert_num=expert_num,
        drop_pad_mode=drop_pad_mode,
        expert_tokens_num_type=expert_tokens_num_type,
        expert_tokens_num_flag=expert_tokens_num_flag,
        quant_mode=quant_mode,
        active_expert_range=active_expert_range,
        row_idx_type=row_idx_type)
    return outputs
|
||||
|
||||
|
||||
def cmp_out_golden(x_golden, x_out, dtype):
    """Elementwise-compare a device output tensor against a golden array.

    Only the first len(x_golden) rows of x_out are compared. int8 results get
    a loose absolute tolerance of 1 LSB; everything else uses tight float
    tolerances. Returns True when every element is within tolerance.
    """
    actual = x_out.cpu().numpy()[:len(x_golden)]
    if dtype == 'int8':
        tolerances = {'atol': 1}
    else:
        tolerances = {'rtol': 1e-05, 'atol': 1e-05}
    return np.all(np.isclose(actual, x_golden, **tolerances))
|
||||
|
||||
|
||||
def test_moe_npu(x, expert_idx, scale, offset, active_num, expert_capacity,
                 expert_num, drop_pad_mode, expert_tokens_num_type,
                 expert_tokens_num_flag, quant_mode, active_expert_range,
                 row_idx_type):
    """Run both the NPU op and the numpy golden model on identical inputs and
    report whether all four outputs agree within tolerance."""

    def to_npu(t):
        return t.npu() if t is not None else None

    def to_numpy(t):
        return t.numpy() if t is not None else None

    # Golden reference on host (note its argument order swaps
    # active_expert_range and quant_mode relative to the op wrapper).
    golden = moe_init_routing_golden(
        to_numpy(x), to_numpy(expert_idx), to_numpy(scale), to_numpy(offset),
        active_num, expert_capacity, expert_num, drop_pad_mode,
        expert_tokens_num_type, expert_tokens_num_flag, active_expert_range,
        quant_mode, row_idx_type)
    golden_x, golden_row_idx, golden_tokens, golden_scale = golden

    # Same inputs through the custom NPU operator.
    npu_x, npu_row_idx, npu_tokens, npu_scale = npu_pta(
        to_npu(x), to_npu(expert_idx), to_npu(scale), to_npu(offset),
        active_num, expert_capacity, expert_num, drop_pad_mode,
        expert_tokens_num_type, expert_tokens_num_flag, quant_mode,
        active_expert_range, row_idx_type)

    # Quantized paths emit int8 data, unquantized keeps float.
    x_dtype = "float32" if quant_mode == -1 else "int8"
    expanded_x_result = cmp_out_golden(golden_x, npu_x, x_dtype)

    expanded_row_idx_result = cmp_out_golden(golden_row_idx, npu_row_idx,
                                             "int32")

    if expert_tokens_num_flag:
        expert_tokens_result = cmp_out_golden(golden_tokens, npu_tokens,
                                              "int64")
    else:
        expert_tokens_result = True

    if quant_mode == 1 or (quant_mode == -1 and scale is not None):
        expand_scale_result = cmp_out_golden(golden_scale.flatten(),
                                             npu_scale, "float32")
    else:
        expand_scale_result = True

    compare_result = (expanded_x_result and expanded_row_idx_result
                      and expert_tokens_result and expand_scale_result)
    return compare_result
|
||||
|
||||
|
||||
def test_moe_init_routing_custom():
    """Randomized sweep over the aclnnMoeInitRoutingCustom attribute space.

    Iterates the Cartesian product of (drop_pad_mode, expert_tokens_num_type,
    expert_tokens_num_flag, quant_mode, row_idx_type, scale_type), draws
    random tensor shapes/contents for each combination, and compares the NPU
    result against the CPU golden via test_moe_npu(). Asserts that no
    combination mismatched, and reports exactly which combinations failed.
    """
    failed_cases = []  # (case index, attribute combo) for every mismatch

    # Attribute values to sweep.
    drop_pad_mode = [0, 1]
    expert_tokens_num_type = [0, 1, 2]
    expert_tokens_num_flag = [True, False]
    quant_mode = [0, 1, -1]
    row_idx_type = [0, 1]
    scale_type = [0, 1, 2]

    product_result = itertools.product(drop_pad_mode, expert_tokens_num_type,
                                       expert_tokens_num_flag, quant_mode,
                                       row_idx_type, scale_type)

    for idx, combo in enumerate(product_result):
        (drop_pad_mode_, expert_tokens_num_type_, expert_tokens_num_flag_,
         quant_mode_, row_idx_type_, scale_type_) = combo

        # Random expert space and an active sub-range [start, end) within it.
        expert_num_ = random.randint(2, 500)
        expert_start = random.randint(0, expert_num_ - 1)
        expert_end = random.randint(expert_start + 1, expert_num_)
        active_expert_range_ = [expert_start, expert_end]

        # Random token count, hidden size and top-k per token.
        N = random.randint(1, 100)
        H = random.randint(12, 100)
        K = random.randint(1, 12)
        x_ = torch.randn(N, H, dtype=torch.float16) * 5
        expert_capacity_ = random.randint(1, N - 1) if N > 1 else 1
        # NOTE(review): torch.randint's upper bound is exclusive, so expert
        # ids never reach expert_num_ - 1 — confirm this is intentional.
        expert_idx_ = torch.randint(0,
                                    expert_num_ - 1, (N, K),
                                    dtype=torch.int32)
        active_num_ = N * K

        # Drop/pad mode constrains the other attributes.
        if drop_pad_mode_ == 1:
            active_expert_range_ = [0, expert_num_]
            expert_tokens_num_type_ = 1
            row_idx_type_ = 0

        # Build scale/offset inputs per quantization mode:
        #    0 -> scalar scale + scalar offset
        #   -1 -> no quantization: neither tensor
        #    1 -> scale shape chosen by scale_type_, offset unused
        if quant_mode_ == 0:
            scale_ = torch.randn(1, dtype=torch.float)
            offset_ = torch.randn(1, dtype=torch.float)
        elif quant_mode_ == -1:
            scale_ = None
            offset_ = None
        else:
            if scale_type_ == 0:
                scale_ = None
            elif scale_type_ == 1:
                scale_ = torch.randn(1, H, dtype=torch.float)
            else:
                scale_ = torch.randn(active_expert_range_[1] -
                                     active_expert_range_[0],
                                     H,
                                     dtype=torch.float)
            offset_ = None  # offset is only used by static quant (mode 0)

        result_pta = test_moe_npu(x_, expert_idx_, scale_, offset_,
                                  active_num_, expert_capacity_, expert_num_,
                                  drop_pad_mode_, expert_tokens_num_type_,
                                  expert_tokens_num_flag_, quant_mode_,
                                  active_expert_range_, row_idx_type_)
        if not result_pta:
            failed_cases.append((idx, combo))

    # Report every failing attribute combination instead of a bare count.
    assert not failed_cases, (
        f"{len(failed_cases)} case(s) failed "
        f"(idx, (drop_pad_mode, tokens_num_type, tokens_num_flag, "
        f"quant_mode, row_idx_type, scale_type)): {failed_cases}")
|
||||