[OP] add custom op aclnnMoeInitRoutingCustom (#5251)
<!-- Thanks for sending a pull request! BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html --> ### What this PR does / why we need it? <!-- - Please clarify what changes you are proposing. The purpose of this section is to outline the changes and how this PR fixes the issue. If possible, please consider writing useful notes for better and faster reviews in your PR. - Please clarify why the changes are needed. For instance, the use case and bug description. - Fixes # --> This pull request introduces a new custom operator `aclnnMoeInitRoutingCustom` for Mixture-of-Experts models. It can be replaced by `aclnnMoeInitRoutingV3` once CANN 8.5 becomes available. ### Does this PR introduce _any_ user-facing change? <!-- Note that it means *any* user-facing change including all aspects such as API, interface or other behavior changes. Documentation-only updates are not considered user-facing changes. --> No. ### How was this patch tested? <!-- CI passed with new added/existing test. If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future. If tests were not added, please describe why they were not added and/or why it was difficult to add. --> --------- Signed-off-by: jiazhengyi <jiazhengyi@huawei.com> Signed-off-by: Chenxi Qian <chenxi.qian.cq@outlook.com> Co-authored-by: jiazhengyi <jiazhengyi@huawei.com> Co-authored-by: Chenxi Qian <chenxi.qian.cq@outlook.com>
This commit is contained in:
@@ -24,7 +24,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910b ]]; then
|
||||
ABSOLUTE_CATLASS_PATH=$(cd "${CATLASS_PATH}" && pwd)
|
||||
export CPATH=${ABSOLUTE_CATLASS_PATH}:${CPATH}
|
||||
|
||||
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm"
|
||||
CUSTOM_OPS="grouped_matmul_swiglu_quant_weight_nz_tensor_list;lightning_indexer;sparse_flash_attention;matmul_allreduce_add_rmsnorm;moe_init_routing_custom"
|
||||
SOC_ARG="ascend910b"
|
||||
elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
|
||||
# ASCEND910C (A3) series
|
||||
@@ -69,6 +69,7 @@ elif [[ "$SOC_VERSION" =~ ^ascend910_93 ]]; then
|
||||
"moe_dispatch_normal"
|
||||
"dispatch_layout"
|
||||
"notify_dispatch"
|
||||
"moe_init_routing_custom"
|
||||
)
|
||||
CUSTOM_OPS=$(IFS=';'; echo "${CUSTOM_OPS_ARRAY[*]}")
|
||||
SOC_ARG="ascend910_93"
|
||||
|
||||
55
csrc/moe_init_routing_custom/op_host/CMakeLists.txt
Normal file
55
csrc/moe_init_routing_custom/op_host/CMakeLists.txt
Normal file
@@ -0,0 +1,55 @@
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
# This file is a part of the CANN Open Software.
|
||||
# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
|
||||
# Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# See LICENSE in the root of the software repository for the full text of the License.
|
||||
# ======================================================================================================================
|
||||
|
||||
add_ops_compile_options(
|
||||
OP_NAME MoeInitRoutingCustom
|
||||
OPTIONS --cce-auto-sync=on
|
||||
-Wno-deprecated-declarations
|
||||
-Werror
|
||||
)
|
||||
|
||||
target_sources(op_host_aclnnExc PRIVATE
|
||||
moe_init_routing_custom_def.cpp
|
||||
)
|
||||
|
||||
target_sources(opapi PRIVATE
|
||||
moe_init_routing_custom.cpp
|
||||
aclnn_moe_init_routing_custom.cpp
|
||||
)
|
||||
|
||||
if (NOT BUILD_OPEN_PROJECT)
|
||||
target_sources(aclnn_ops_train PRIVATE
|
||||
moe_init_routing_custom.cpp
|
||||
aclnn_moe_init_routing_custom.cpp
|
||||
)
|
||||
|
||||
target_sources(aclnn_ops_infer PRIVATE
|
||||
moe_init_routing_custom.cpp
|
||||
aclnn_moe_init_routing_custom.cpp
|
||||
)
|
||||
endif ()
|
||||
|
||||
target_sources(optiling PRIVATE
|
||||
moe_init_routing_custom_tiling_base.cpp
|
||||
moe_init_routing_custom_tiling.cpp
|
||||
)
|
||||
|
||||
target_include_directories(optiling PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
)
|
||||
|
||||
target_sources(opsproto PRIVATE
|
||||
moe_init_routing_custom_infershape.cpp
|
||||
)
|
||||
|
||||
file(GLOB _GMM_Aclnn_header "${CMAKE_CURRENT_SOURCE_DIR}/aclnn_moe_init_routing_custom.h")
|
||||
|
||||
install(FILES ${_GMM_Aclnn_header}
|
||||
DESTINATION ${ACLNN_INC_INSTALL_DIR} OPTIONAL
|
||||
)
|
||||
@@ -0,0 +1,143 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <tuple>
|
||||
#include <cstddef>
|
||||
#include "opdev/make_op_executor.h"
|
||||
#include "aclnn_kernels/contiguous.h"
|
||||
#include "opdev/tensor_view_utils.h"
|
||||
#include "aclnn_kernels/common/op_error_check.h"
|
||||
#include "opdev/op_log.h"
|
||||
#include "aclnn_kernels/cast.h"
|
||||
#include "opdev/common_types.h"
|
||||
#include "moe_init_routing_custom.h"
|
||||
#include "aclnn_moe_init_routing_custom.h"
|
||||
|
||||
using namespace op;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
static const int64_t MOE_DIM_2 = 2;
|
||||
static const int64_t MOE_DIM_1 = 1;
|
||||
}
|
||||
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_X= {DataType::DT_FLOAT16, DataType::DT_BF16, DataType::DT_FLOAT, DataType::DT_INT8};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPERT_IDX = {DataType::DT_INT32};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_SCALE = {DataType::DT_FLOAT};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_OFFSET= {DataType::DT_FLOAT};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_X_OUT = {DataType::DT_FLOAT16, DataType::DT_BF16, DataType::DT_FLOAT, DataType::DT_INT8};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_ROW_IDX_OUT = {DataType::DT_INT32};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPERT_TOKENS_COUNT_OR_CUMSUMOUT = {DataType::DT_INT64};
|
||||
static const std::initializer_list<DataType> DTYPE_SUPPORT_LIST_EXPANDED_SCALE_OUT = {DataType::DT_FLOAT};
|
||||
|
||||
static inline bool CheckNotNull(const aclTensor *x,
|
||||
const aclTensor *expertIdx,
|
||||
const aclTensor *expandedXOut,
|
||||
const aclTensor *expandedRowIdxOut,
|
||||
const aclTensor *expertTokensCountOrCumsumOut,
|
||||
const aclTensor *expandedScaleOut) {
|
||||
OP_CHECK_NULL(x, return false);
|
||||
OP_CHECK_NULL(expertIdx, return false);
|
||||
OP_CHECK_NULL(expandedXOut, return false);
|
||||
OP_CHECK_NULL(expandedRowIdxOut, return false);
|
||||
OP_CHECK_NULL(expertTokensCountOrCumsumOut, return false);
|
||||
OP_CHECK_NULL(expandedScaleOut, return false);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
aclnnStatus aclnnMoeInitRoutingCustomGetWorkspaceSize(const aclTensor *x,
|
||||
const aclTensor *expertIdx,
|
||||
const aclTensor *scaleOptional,
|
||||
const aclTensor *offsetOptional,
|
||||
int64_t activeNum,
|
||||
int64_t expertCapacity,
|
||||
int64_t expertNum,
|
||||
int64_t dropPadMode,
|
||||
int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag,
|
||||
int64_t quantMode,
|
||||
const aclIntArray *activeExpertRangeOptional,
|
||||
int64_t rowIdxType,
|
||||
const aclTensor *expandedXOut,
|
||||
const aclTensor *expandedRowIdxOut,
|
||||
const aclTensor *expertTokensCountOrCumsumOut,
|
||||
const aclTensor *expandedScaleOut,
|
||||
uint64_t *workspaceSize,
|
||||
aclOpExecutor **executor)
|
||||
{
|
||||
L2_DFX_PHASE_1(aclnnMoeInitRoutingCustom,
|
||||
DFX_IN(x, expertIdx, scaleOptional, offsetOptional,
|
||||
activeNum, expertCapacity, expertNum, dropPadMode,
|
||||
expertTokensNumType, expertTokensNumFlag, quantMode, activeExpertRangeOptional, rowIdxType),
|
||||
DFX_OUT(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut));
|
||||
auto ret = CheckNotNull(x, expertIdx, expandedXOut, expandedRowIdxOut,
|
||||
expertTokensCountOrCumsumOut, expandedScaleOut);
|
||||
|
||||
CHECK_RET(ret, ACLNN_ERR_PARAM_NULLPTR);
|
||||
|
||||
auto uniqueExecutor = CREATE_EXECUTOR();
|
||||
CHECK_RET(uniqueExecutor.get() != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
|
||||
auto xContiguous = l0op::Contiguous(x, uniqueExecutor.get());
|
||||
CHECK_RET(xContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
auto expertIdxContiguous = l0op::Contiguous(expertIdx, uniqueExecutor.get());
|
||||
CHECK_RET(expertIdxContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
|
||||
const aclTensor* scaleContiguous = nullptr;
|
||||
const aclTensor* offsetContiguous = nullptr;
|
||||
if (scaleOptional != nullptr) {
|
||||
scaleContiguous = l0op::Contiguous(scaleOptional, uniqueExecutor.get());
|
||||
CHECK_RET(scaleContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
}
|
||||
|
||||
if (offsetOptional != nullptr) {
|
||||
offsetContiguous = l0op::Contiguous(offsetOptional, uniqueExecutor.get());
|
||||
CHECK_RET(offsetContiguous != nullptr, ACLNN_ERR_INNER_CREATE_EXECUTOR);
|
||||
}
|
||||
|
||||
auto routingResult = std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(nullptr, nullptr, nullptr, nullptr);
|
||||
routingResult = l0op::MoeInitRoutingCustom(xContiguous, expertIdxContiguous, scaleContiguous, offsetContiguous,
|
||||
activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag,
|
||||
quantMode, activeExpertRangeOptional, rowIdxType, expandedXOut, expandedRowIdxOut,
|
||||
expertTokensCountOrCumsumOut, expandedScaleOut, uniqueExecutor.get());
|
||||
auto [expandedXOut_, expandedRowIdxOut_, expertTokensCountOrCumsumOut_, expandedScaleOut_] = routingResult;
|
||||
bool hasNullptr = (expandedXOut_ == nullptr) || (expandedRowIdxOut_ == nullptr) || (expertTokensCountOrCumsumOut_ == nullptr) || (expandedScaleOut_ == nullptr);
|
||||
CHECK_RET(hasNullptr != true, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
auto viewCopyExpandedXOutResult = l0op::ViewCopy(expandedXOut_, expandedXOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpandedXOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
auto viewCopyExpandedRowIdxOutResult = l0op::ViewCopy(expandedRowIdxOut_, expandedRowIdxOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpandedRowIdxOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
auto viewCopyExpertTokensCountOrCumsumOutResult = l0op::ViewCopy(expertTokensCountOrCumsumOut_, expertTokensCountOrCumsumOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpertTokensCountOrCumsumOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
auto viewCopyExpandedScaleOutResult = l0op::ViewCopy(expandedScaleOut_, expandedScaleOut, uniqueExecutor.get());
|
||||
CHECK_RET(viewCopyExpandedScaleOutResult != nullptr, ACLNN_ERR_INNER_NULLPTR);
|
||||
|
||||
*workspaceSize = uniqueExecutor->GetWorkspaceSize();
|
||||
uniqueExecutor.ReleaseTo(executor);
|
||||
return ACLNN_SUCCESS;
|
||||
}
|
||||
aclnnStatus aclnnMoeInitRoutingCustom(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
|
||||
aclrtStream stream)
|
||||
{
|
||||
L2_DFX_PHASE_2(aclnnMoeInitRoutingCustom);
|
||||
return CommonOpExecutorRun(workspace, workspaceSize, executor, stream);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,47 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#ifndef OP_API_INC_MOE_INIT_ROUTING_CUSTOM_H_
|
||||
#define OP_API_INC_MOE_INIT_ROUTING_CUSTOM_H_
|
||||
|
||||
#include "aclnn/aclnn_base.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
__attribute__((visibility("default"))) aclnnStatus aclnnMoeInitRoutingCustomGetWorkspaceSize(const aclTensor *x,
|
||||
const aclTensor *expertIdx,
|
||||
const aclTensor *scaleOptional,
|
||||
const aclTensor *offsetOptional,
|
||||
int64_t activeNum,
|
||||
int64_t expertCapacity,
|
||||
int64_t expertNum,
|
||||
int64_t dropPadMode,
|
||||
int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag,
|
||||
int64_t quantMode,
|
||||
const aclIntArray *activeExpertRangeOptional,
|
||||
int64_t rowIdxType,
|
||||
const aclTensor *expandedXOut,
|
||||
const aclTensor *expandedRowIdxOut,
|
||||
const aclTensor *expertTokensCountOrCumsumOut,
|
||||
const aclTensor *expandedScaleOut,
|
||||
uint64_t *workspaceSize,
|
||||
aclOpExecutor **executor);
|
||||
|
||||
__attribute__((visibility("default"))) aclnnStatus aclnnMoeInitRoutingCustom(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
|
||||
aclrtStream stream);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,50 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#include <tuple>
|
||||
#include "moe_init_routing_custom.h"
|
||||
#include "opdev/make_op_executor.h"
|
||||
#include "opdev/op_def.h"
|
||||
#include "opdev/op_dfx.h"
|
||||
#include "opdev/op_executor.h"
|
||||
#include "opdev/op_log.h"
|
||||
#include "opdev/shape_utils.h"
|
||||
#include "aclnn_kernels/common/op_error_check.h"
|
||||
|
||||
using namespace op;
|
||||
|
||||
namespace l0op {
|
||||
OP_TYPE_REGISTER(MoeInitRoutingCustom);
|
||||
|
||||
std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*> MoeInitRoutingCustom(const aclTensor *x, const aclTensor *expertIdx, const aclTensor *scale,
|
||||
const aclTensor *offset, int64_t activeNum, int64_t expertCapacity,
|
||||
int64_t expertNum, int64_t dropPadMode, int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag, int64_t quantMode, const aclIntArray *activeExpertRange,
|
||||
int64_t rowIdxType, const aclTensor *expandedX, const aclTensor *expandedRowIdx,
|
||||
const aclTensor *expertTokensCountOrCumsum, const aclTensor *expandedScale, aclOpExecutor *executor)
|
||||
{
|
||||
L0_DFX(MoeInitRoutingCustom, x, expertIdx, scale, offset, activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag,
|
||||
quantMode, activeExpertRange, rowIdxType, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale);
|
||||
|
||||
auto expandedXOut = executor->AllocTensor(expandedX->GetViewShape(), expandedX->GetDataType(), Format::FORMAT_ND);
|
||||
auto expandedRowIdxOut = executor->AllocTensor(expandedRowIdx->GetViewShape(), expandedRowIdx->GetDataType(), Format::FORMAT_ND);
|
||||
auto expertTokensCountOrCumsumOut = executor->AllocTensor(expertTokensCountOrCumsum->GetViewShape(), expertTokensCountOrCumsum->GetDataType(), Format::FORMAT_ND);
|
||||
auto expandedScaleOut = executor->AllocTensor(expandedScale->GetViewShape(), expandedScale->GetDataType(), Format::FORMAT_ND);
|
||||
if (expandedXOut == nullptr || expandedRowIdxOut == nullptr || expertTokensCountOrCumsumOut == nullptr || expandedScaleOut == nullptr) {
|
||||
OP_LOGE(ACLNN_ERR_INNER_NULLPTR, "alloc expandedXOut or expandedRowIdxOut or expertTokensCountOrCumsumOut or expandedScaleOut tensor failed.");
|
||||
return std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(nullptr, nullptr, nullptr, nullptr);
|
||||
}
|
||||
|
||||
ADD_TO_LAUNCHER_LIST_AICORE(
|
||||
MoeInitRoutingCustom, OP_INPUT(x, expertIdx, scale, offset), OP_OUTPUT(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut), OP_ATTR(activeNum, expertCapacity, expertNum, dropPadMode, expertTokensNumType, expertTokensNumFlag, quantMode, activeExpertRange, rowIdxType));
|
||||
return std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*>(expandedXOut, expandedRowIdxOut, expertTokensCountOrCumsumOut, expandedScaleOut); //OP_OUTPUT
|
||||
}
|
||||
|
||||
} // namespace l0op
|
||||
@@ -0,0 +1,25 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
#ifndef OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
|
||||
#define OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
|
||||
|
||||
#include <tuple>
|
||||
#include "opdev/op_executor.h"
|
||||
|
||||
namespace l0op {
|
||||
std::tuple<aclTensor*, aclTensor*, aclTensor*, aclTensor*> MoeInitRoutingCustom(const aclTensor *x, const aclTensor *expertIdx, const aclTensor *scale,
|
||||
const aclTensor *offset, int64_t activeNum, int64_t expertCapacity,
|
||||
int64_t expertNum, int64_t dropPadMode, int64_t expertTokensNumType,
|
||||
bool expertTokensNumFlag, int64_t quantMode, const aclIntArray *activeExpertRange,
|
||||
int64_t rowIdxType, const aclTensor *expandedX, const aclTensor *expandedRowIdx,
|
||||
const aclTensor *expertTokensCountOrCumsum, const aclTensor *expandedScale, aclOpExecutor *executor);
|
||||
} // namespace l0op
|
||||
#endif // OP_API_INC_LEVEL0_MOE_INIT_ROUTING_CUSTOM_H
|
||||
@@ -0,0 +1,105 @@
|
||||
/**
|
||||
* This program is free software, you can redistribute it and/or modify.
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This file is a part of the CANN Open Software.
|
||||
* Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_init_routing_v3_def.cpp
|
||||
* \brief
|
||||
*/
|
||||
#include "register/op_def_registry.h"
|
||||
|
||||
namespace ops {
|
||||
class MoeInitRoutingCustom : public OpDef {
|
||||
public:
|
||||
explicit MoeInitRoutingCustom(const char *name) : OpDef(name)
|
||||
{
|
||||
this->Input("x")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT8, ge::DT_FLOAT16, ge::DT_BF16, ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Input("expert_idx")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Input("scale")
|
||||
.ParamType(OPTIONAL)
|
||||
.DataType(
|
||||
{ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Input("offset")
|
||||
.ParamType(OPTIONAL)
|
||||
.DataType(
|
||||
{ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.AutoContiguous();
|
||||
this->Output("expanded_x")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType({ge::DT_INT8, ge::DT_FLOAT16, ge::DT_BF16, ge::DT_FLOAT, ge::DT_INT8, ge::DT_INT8, ge::DT_INT8})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Output("expanded_row_idx")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Output("expert_tokens_count_or_cumsum")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64, ge::DT_INT64})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Output("expanded_scale")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType(
|
||||
{ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT, ge::DT_FLOAT})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND,
|
||||
ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
this->Attr("active_num").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("expert_capacity").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("expert_num").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("drop_pad_mode").AttrType(OPTIONAL).Int(0);
|
||||
this->Attr("expert_tokens_num_type").AttrType(OPTIONAL).Int(0);
|
||||
this->Attr("expert_tokens_num_flag").AttrType(OPTIONAL).Bool(false);
|
||||
this->Attr("quant_mode").AttrType(OPTIONAL).Int(-1);
|
||||
this->Attr("active_expert_range").AttrType(OPTIONAL).ListInt({});
|
||||
this->Attr("row_idx_type").AttrType(OPTIONAL).Int(0);
|
||||
this->AICore().AddConfig("ascend910b");
|
||||
this->AICore().AddConfig("ascend910_93");
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
OP_ADD(MoeInitRoutingCustom);
|
||||
} // namespace ops
|
||||
@@ -0,0 +1,797 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/* !
|
||||
* \file moe_init_routing_custom_infershape.cpp
|
||||
* \brief
|
||||
*/
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "register/op_def_registry.h"
|
||||
#include "log/ops_log.h"
|
||||
#include "platform/platform_info.h"
|
||||
|
||||
#define unlikely(x) __builtin_expect((x), 0)
|
||||
#define OP_CHECK_NULL_WITH_CONTEXT(context, ptr) \
|
||||
do { \
|
||||
if (unlikely((ptr) == nullptr)) { \
|
||||
const char* name = (unlikely(((context) == nullptr) || (context)->GetNodeName() == nullptr)) ? \
|
||||
"nil" : \
|
||||
(context)->GetNodeName(); \
|
||||
OPS_LOG_E(name, "%s is nullptr!", #ptr); \
|
||||
return ge::GRAPH_FAILED; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
using namespace ge;
|
||||
namespace ops {
|
||||
static constexpr size_t DIM_ONE = 1U;
|
||||
static constexpr size_t DIM_TWO = 2U;
|
||||
static constexpr size_t DIM_THREE = 3U;
|
||||
static constexpr int64_t NEG_ONE = static_cast<int64_t>(-1);
|
||||
static constexpr int64_t NEG_TWO = static_cast<int64_t>(-2);
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_X = 0;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_EXPERT_IDX = 1;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_SCALE = 2;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_INPUT_OFFSET = 3;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_NUM = 0;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_CAPACITY = 1;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_NUM = 2;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_DROP_PAD_MODE = 3;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_TYPE = 4;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_FLAG = 5;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_QUANT_MODE = 6;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_EXPERT_RANGE = 7;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_ATTR_ROW_IDX_TYPE = 8;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X = 0;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX = 1;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT = 2;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE = 3;
|
||||
static constexpr int64_t MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND = 10240;
|
||||
static constexpr int64_t KEY_VALUE_MODE_DIM0_NUM = 2;
|
||||
enum DropPadMode : int8_t {
|
||||
NO_DROP_PAD = 0,
|
||||
DROP_PAD = 1,
|
||||
};
|
||||
enum QuantMode : int8_t {
|
||||
NON_QUANT = -1,
|
||||
STATIC_QUANT = 0,
|
||||
DYNAMIC_QUANT = 1
|
||||
};
|
||||
enum ExpertTokenNumType : int8_t {
|
||||
CUMSUM = 0,
|
||||
COUNT = 1,
|
||||
KEY_VALUE = 2
|
||||
};
|
||||
|
||||
static bool isSameDim(int64_t dim1, int64_t dim2)
|
||||
{
|
||||
if (dim1 <= NEG_ONE || dim2 <= NEG_ONE) {
|
||||
return true;
|
||||
}
|
||||
return dim1 == dim2;
|
||||
}
|
||||
|
||||
static ge::graphStatus GetAndCheckAttrActiveExpertRange(const gert::RuntimeAttrs *attrs,
|
||||
gert::InferShapeContext *context, int64_t &expertStart,
|
||||
int64_t &expertEnd, int64_t &experNum)
|
||||
{
|
||||
OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrActiveExpertRange.");
|
||||
// Check if active_expert_range size is 2 and if expert_start < expert_end
|
||||
auto activeExpertRangePtr = attrs->GetListInt(MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_EXPERT_RANGE);
|
||||
if (nullptr == activeExpertRangePtr) {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_expert_range should be list int. But it is none.");
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
int64_t activeExpertRangeSize = activeExpertRangePtr->GetSize();
|
||||
if (activeExpertRangePtr->GetSize() == DIM_TWO) {
|
||||
expertStart = activeExpertRangePtr->GetData()[0];
|
||||
expertEnd = activeExpertRangePtr->GetData()[1];
|
||||
if (expertStart >= expertEnd || expertStart < 0 || expertEnd > MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND) {
|
||||
OPS_LOG_E(context->GetNodeName(),
|
||||
"The active_expert_range should be in [0, %ld), but the active_expert_range is [%ld, %ld).",
|
||||
MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND, expertStart, expertEnd);
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
} else if (activeExpertRangePtr->GetSize() == 0) {
|
||||
expertStart = 0;
|
||||
expertEnd = experNum;
|
||||
} else {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_expert_range size should be 2, but its size is %ld.", activeExpertRangeSize);
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
|
||||
OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrActiveExpertRange.");
|
||||
return ge::GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
static ge::graphStatus GetAndCheckAttrActiveNum(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
|
||||
int64_t &activeNum, int64_t &dropPadMode)
|
||||
{
|
||||
OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrActiveNum.");
|
||||
const int64_t *activeNumPtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_ACTIVE_NUM);
|
||||
if (nullptr == activeNumPtr) {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_num should not be none.");
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
activeNum = *activeNumPtr;
|
||||
if (dropPadMode == DropPadMode::NO_DROP_PAD && activeNum < -1) {
|
||||
OPS_LOG_E(context->GetNodeName(), "The active_num should be greater than or equal to 0. But it is %ld.", activeNum);
|
||||
return ge::GRAPH_FAILED;
|
||||
}
|
||||
|
||||
OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrActiveNum.");
|
||||
return ge::GRAPH_SUCCESS;
|
||||
}
|
||||
|
||||
// Reads and validates the "expert_capacity" attribute.
// Only the upper bound is enforced, and only in DROP_PAD mode with a static (known) first
// dim of x: expert_capacity must not exceed x's row count.
// NOTE(review): no lower bound is checked here; presumably a non-positive capacity is
// rejected elsewhere (e.g. in tiling) — confirm before adding a stricter check.
static ge::graphStatus GetAndCheckAttrExpertCapacity(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                     const gert::Shape *xShape, int64_t &expertCapacity,
                                                     int64_t &dropPadMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertCapacity.");
    const int64_t *expertCapacityPtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_CAPACITY);
    if (nullptr == expertCapacityPtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_capacity should not be none.");
        return ge::GRAPH_FAILED;
    }
    expertCapacity = *expertCapacityPtr;
    // Bug fix: the message previously said "between 0 and n" although the check only
    // enforces the upper bound; it now describes the actual constraint.
    if (dropPadMode == DropPadMode::DROP_PAD && xShape->GetDim(0) > 0 && expertCapacity > xShape->GetDim(0)) {
        OPS_LOG_E(context->GetNodeName(), "The expert_capacity should not be greater than the first dim of x (%ld). But it is %ld.",
                  xShape->GetDim(0), expertCapacity);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertCapacity.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads and validates the "expert_num" attribute.
// expert_num must be in (0, MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND].
static ge::graphStatus GetAndCheckAttrExpertNum(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                int64_t &experNum)
{
    // Bug fix: the begin log previously printed the garbled name "GetAndCheckexperNum".
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertNum.");
    const int64_t *experNumPtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_NUM);
    if (nullptr == experNumPtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_num should not be none.");
        return ge::GRAPH_FAILED;
    }
    experNum = *experNumPtr;
    // Bug fix: the message previously mentioned only the lower bound although the upper
    // bound is also enforced.
    if (experNum <= 0 || experNum > MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND) {
        OPS_LOG_E(context->GetNodeName(), "The expert_num should be in (0, %ld]. But it is %ld.",
                  MOE_INIT_ROUTING_CUSTOM_EXPERT_END_BOUND, experNum);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertNum.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "drop_pad_mode" attribute and verifies it lies within the DropPadMode range
// [NO_DROP_PAD, DROP_PAD]. Returns GRAPH_FAILED on a missing or out-of-range value.
static ge::graphStatus GetAndCheckAttrDropPadMode(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                  int64_t &dropPadMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrDropPadMode.");
    const int64_t *modePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_DROP_PAD_MODE);
    if (modePtr == nullptr) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for drop_pad_mode is none.");
        return ge::GRAPH_FAILED;
    }

    dropPadMode = *modePtr;
    const bool inRange = (dropPadMode >= DropPadMode::NO_DROP_PAD) && (dropPadMode <= DropPadMode::DROP_PAD);
    if (!inRange) {
        OPS_LOG_E(context->GetNodeName(), "The drop_pad_mode should be %d or %d. But it is %ld.", DropPadMode::NO_DROP_PAD,
                  DropPadMode::DROP_PAD, dropPadMode);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrDropPadMode.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "expert_token_num_type" attribute and verifies it is one of
// CUMSUM, COUNT or KEY_VALUE.
static ge::graphStatus GetAndCheckAttrExpertTokenNumType(const gert::RuntimeAttrs *attrs, gert::InferShapeContext* context,
                                                         int64_t &experTokenNumType)
{
    // Bug fix: the begin log previously printed the garbled name "GetAndCheckexperTokenNumType".
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertTokenNumType.");
    const int64_t *experTokenNumTypePtr =
        attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_TYPE);
    if (nullptr == experTokenNumTypePtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_token_num_type should not be none.");
        return ge::GRAPH_FAILED;
    }
    experTokenNumType = *experTokenNumTypePtr;
    if (experTokenNumType < ExpertTokenNumType::CUMSUM || experTokenNumType > ExpertTokenNumType::KEY_VALUE) {
        OPS_LOG_E(context->GetNodeName(), "The expert_token_num_type should be %d, %d or %d. But it is %ld.",
                  ExpertTokenNumType::CUMSUM, ExpertTokenNumType::COUNT, ExpertTokenNumType::KEY_VALUE,
                  experTokenNumType);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertTokenNumType.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "expert_token_num_flag" attribute (whether the cumsum/count output is produced).
// The flag itself has no range to validate; only presence is checked.
static ge::graphStatus GetAndCheckAttrExpertTokenNumFlag(const gert::RuntimeAttrs *attrs,
                                                         gert::InferShapeContext *context, bool &experTokenNumFlag)
{
    // Bug fix: both debug logs previously printed the sibling function's name
    // ("GetAndCheck...ExpertTokenNumType") due to copy-paste.
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrExpertTokenNumFlag.");
    const bool *experTokenNumFlagPtr = attrs->GetAttrPointer<bool>(MOE_INIT_ROUTING_CUSTOM_ATTR_EXPERT_TOKEN_NUM_FLAG);
    if (nullptr == experTokenNumFlagPtr) {
        OPS_LOG_E(context->GetNodeName(), "The expert_token_num_flag should not be none.");
        return ge::GRAPH_FAILED;
    }
    experTokenNumFlag = *experTokenNumFlagPtr;
    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrExpertTokenNumFlag.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "quant_mode" attribute and verifies it is one of NON_QUANT, STATIC_QUANT
// or DYNAMIC_QUANT. Returns GRAPH_FAILED on missing attrs, missing value or bad range.
static ge::graphStatus GetAndCheckAttrQuantMode(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                int64_t &quantMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckQuantMode.");
    if (attrs == nullptr) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for quant_mode is none.");
        return ge::GRAPH_FAILED;
    }
    const int64_t *modeAttr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_QUANT_MODE);
    if (modeAttr == nullptr) {
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is none.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT);
        return ge::GRAPH_FAILED;
    }

    quantMode = *modeAttr;
    const bool valid = (quantMode >= QuantMode::NON_QUANT) && (quantMode <= QuantMode::DYNAMIC_QUANT);
    if (!valid) {
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is %ld.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT, quantMode);
        return ge::GRAPH_FAILED;
    }
    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckQuantMode.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Reads the "row_Idx_type" attribute (0 or 1) and, as a side effect, also re-reads
// "drop_pad_mode" into dropPadMode. In DROP_PAD mode only row_Idx_type == 0 is allowed.
static ge::graphStatus GetAndCheckAttrRowIdxType(const gert::RuntimeAttrs *attrs, gert::InferShapeContext *context,
                                                 int64_t &rowIdxType, int64_t &dropPadMode)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do GetAndCheckAttrRowIdxType.");
    if (nullptr == attrs) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for row_Idx_type is none.");
        return ge::GRAPH_FAILED;
    }
    const int64_t *dropPadModePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_DROP_PAD_MODE);
    // Bug fix: dropPadModePtr was dereferenced without a null check, unlike every other
    // attribute read in this file.
    if (nullptr == dropPadModePtr) {
        OPS_LOG_E(context->GetNodeName(), "The RuntimeAttrs for drop_pad_mode is none.");
        return ge::GRAPH_FAILED;
    }
    dropPadMode = *dropPadModePtr;

    const int64_t *rowIdxTypePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_ROW_IDX_TYPE);
    if (nullptr == rowIdxTypePtr) {
        OPS_LOG_E(context->GetNodeName(), "The row_Idx_type should be 0 or 1. But it is none.");
        return ge::GRAPH_FAILED;
    }
    rowIdxType = *rowIdxTypePtr;
    if (dropPadMode == DropPadMode::DROP_PAD && rowIdxType != 0) {
        OPS_LOG_E(context->GetNodeName(), "The row_Idx_type should be 0 when dropPadMode is equal to 1 But it is %ld.", rowIdxType);
        return ge::GRAPH_FAILED;
    }

    if (rowIdxType < 0 || rowIdxType > 1) {
        OPS_LOG_E(context->GetNodeName(), "The row_Idx_type should be 0 or 1 But it is %ld.", rowIdxType);
        return ge::GRAPH_FAILED;
    }

    OPS_LOG_D(context->GetNodeName(), "End to do GetAndCheckAttrRowIdxType.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Validates the shape of the optional scale input against quant_mode.
// - STATIC_QUANT: scale is required and must be 1-D with dim (-1), (-2) or (1,).
// - NON_QUANT: scale is optional; if present, 1-D with dim (-1)/(-2) (dynamic graph)
//   or x's first dim (static graph).
// - DYNAMIC_QUANT: scale is optional; if present, (-2) in 1-D form, or 2-D with first
//   dim 1 / active-expert-range and second dim equal to x's second dim.
static ge::graphStatus CheckInputScaleShape(gert::InferShapeContext *context, const gert::Shape *xShape,
                                            const gert::Shape *scaleShape, const int64_t expertStart,
                                            const int64_t expertEnd, const int64_t quantMode)
{
    // When quant_mode is STATIC_QUANT, scale cannot be none.
    OP_CHECK((nullptr == scaleShape && QuantMode::STATIC_QUANT == quantMode),
             OPS_LOG_E(context->GetNodeName(), "The scale cannot be none when quant_mode is %ld.", quantMode),
             return ge::GRAPH_FAILED);

    // When quant_mode is NON_QUANT or DYNAMIC_QUANT, scale can be none.
    OP_CHECK((nullptr == scaleShape && (QuantMode::NON_QUANT == quantMode || QuantMode::DYNAMIC_QUANT == quantMode)),
             OPS_LOG_I(context->GetNodeName(), "When quant_mode is NON_QUANT or DYNAMIC_QUANT, scale can be none."),
             return ge::GRAPH_SUCCESS);

    if (QuantMode::NON_QUANT == quantMode) {
        if (scaleShape->GetDimNum() == DIM_ONE) {
            OP_CHECK(scaleShape->GetDim(0) < 0 && scaleShape->GetDim(0) != NEG_ONE && scaleShape->GetDim(0) != NEG_TWO,
                     OPS_LOG_E(context->GetNodeName(),
                     "When quant_mode is %ld and use scale in dynamic graph, The shape of scale should be (-1) or (-2), current shape is (%s).",
                     quantMode, ops::Shape2String(*scaleShape).c_str()),
                     return ge::GRAPH_FAILED);
            OP_CHECK(scaleShape->GetDim(0) > 0 && !isSameDim(scaleShape->GetDim(0), xShape->GetDim(0)),
                     OPS_LOG_E(context->GetNodeName(),
                     "When quant_mode is %ld and use scale in static graph, The shape of scale should be (%ld,), current shape is (%s).",
                     quantMode, xShape->GetDim(0), ops::Shape2String(*scaleShape).c_str()),
                     return ge::GRAPH_FAILED);
        } else {
            OPS_LOG_E(context->GetNodeName(), "When quant_mode is %ld, The dimNum of scale should be 1, current shape is (%ld).", quantMode,
                      scaleShape->GetDimNum());
            return ge::GRAPH_FAILED;
        }
    } else if (QuantMode::STATIC_QUANT == quantMode) {
        if (scaleShape->GetDimNum() == DIM_ONE) {
            OP_CHECK(
                scaleShape->GetDim(0) != NEG_ONE && scaleShape->GetDim(0) != NEG_TWO &&
                !isSameDim(scaleShape->GetDim(0), DIM_ONE),
                OPS_LOG_E(
                    context->GetNodeName(),
                    "When quant_mode is %ld, the shape of scale should be (-1) or (-2) or (1,), current shape is (%s).",
                    quantMode, ops::Shape2String(*scaleShape).c_str()),
                return ge::GRAPH_FAILED);
        } else {
            OPS_LOG_E(context->GetNodeName(), "When quant_mode is %ld, the dimNum of scale should be (1,), current shape is (%ld).",
                      quantMode, scaleShape->GetDimNum());
            return ge::GRAPH_FAILED;
        }
    } else if (QuantMode::DYNAMIC_QUANT == quantMode) {
        int64_t activeExpertRange = expertEnd - expertStart;
        if (scaleShape->GetDimNum() == DIM_ONE) {
            OP_CHECK(scaleShape->GetDim(0) != NEG_TWO,
                     OPS_LOG_E(context->GetNodeName(),
                     "When quant_mode is %ld and scale dim is 1 in dynamic graph, the first dim of scale should be -2, but "
                     "its shape is (%ld).",
                     quantMode, scaleShape->GetDim(0)),
                     return ge::GRAPH_FAILED);
        } else if (scaleShape->GetDimNum() == DIM_TWO) {
            if (scaleShape->GetDim(0) > 0) {
                OP_CHECK(
                    !isSameDim(scaleShape->GetDim(0), activeExpertRange) && !isSameDim(scaleShape->GetDim(0), DIM_ONE),
                    OPS_LOG_E(
                        context->GetNodeName(),
                        "When quant_mode is %ld in static graph, the first dim of scale should be 1 or %ld, but its shape is (%ld).",
                        quantMode, activeExpertRange, scaleShape->GetDim(0)),
                    return ge::GRAPH_FAILED);
                // Bug fix: the message said "should or" and printed GetDim(0); it now reads
                // "should be" and prints the second dim, which is what is being checked.
                OP_CHECK(
                    !isSameDim(scaleShape->GetDim(1), xShape->GetDim(1)),
                    OPS_LOG_E(
                        context->GetNodeName(),
                        "When quant_mode is %ld in static graph, the second dim of scale should be %ld, but its shape is (%ld).",
                        quantMode, xShape->GetDim(1), scaleShape->GetDim(1)),
                    return ge::GRAPH_FAILED);
            } else {
                // Bug fix: %d -> %ld for the int64_t dim argument xShape->GetDim(1).
                OP_CHECK(
                    scaleShape->GetDim(0) != NEG_ONE || (scaleShape->GetDim(1) != NEG_ONE && scaleShape->GetDim(1) != xShape->GetDim(1)),
                    OPS_LOG_E(context->GetNodeName(),
                    "When quant_mode is %ld and scale dim is 2 in dynamic graph, the shape of scale should be (-1, -1) or (-1, %ld), but its shape is (%s).",
                    quantMode, xShape->GetDim(1), ops::Shape2String(*scaleShape).c_str()),
                    return ge::GRAPH_FAILED);
            }
        } else {
            // Bug fix: the format string has two %ld but only one argument was supplied,
            // so GetDimNum() was printed in the quant_mode slot and the second slot read
            // past the argument list (undefined behavior).
            OPS_LOG_E(
                context->GetNodeName(),
                "When quant_mode is %ld, the dimNum of scale should be 1(dynamic graph) or 2, but its shape is (%ld).",
                quantMode, scaleShape->GetDimNum());
            return ge::GRAPH_FAILED;
        }
    }
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Validates the shape of the optional offset input.
// offset is only required (and only checked) when quant_mode is STATIC_QUANT; it must
// then be 1-D with dim (1,) in static graphs or (-1)/(-2) in dynamic graphs.
// expertStart/expertEnd are currently unused; they are kept for signature parity with
// CheckInputScaleShape.
static ge::graphStatus CheckInputOffsetShape(gert::InferShapeContext *context,
                                             const gert::Shape *offsetShape, const int64_t expertStart,
                                             const int64_t expertEnd, const int64_t quantMode)
{
    // The shape of offset can be none unless quant_mode is STATIC_QUANT.
    if (quantMode != QuantMode::STATIC_QUANT) {
        return ge::GRAPH_SUCCESS;
    } else if (nullptr == offsetShape) {
        // Bug fix: this failure path previously returned without logging anything,
        // unlike every other validation in this file.
        OPS_LOG_E(context->GetNodeName(), "The offset cannot be none when quant_mode is %ld.", quantMode);
        return ge::GRAPH_FAILED;
    }

    if (offsetShape->GetDimNum() != DIM_ONE) {
        OPS_LOG_E(context->GetNodeName(), "The dimNum of offset should be 1, current shape is (%ld).", offsetShape->GetDimNum());
        return ge::GRAPH_FAILED;
    }
    if (offsetShape->GetDim(0) != NEG_ONE && offsetShape->GetDim(0) != NEG_TWO && !isSameDim(offsetShape->GetDim(0), DIM_ONE)) {
        OPS_LOG_E(context->GetNodeName(),
                  "The shape of offset should be (1,) in static graph or (-2), (-1,) in dynamic graph, current shape is (%s).",
                  ops::Shape2String(*offsetShape).c_str());
        return ge::GRAPH_FAILED;
    }

    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Validates the shapes of x and expert_idx (rank, dynamic markers, matching first dims)
// and delegates scale/offset shape validation to the dedicated helpers.
static ge::graphStatus CheckInputShape(gert::InferShapeContext *context, const gert::Shape *xShape,
                                       const gert::Shape *expertIdxShape, const gert::Shape *scaleShape,
                                       const gert::Shape *offsetShape, const int64_t expertStart,
                                       const int64_t expertEnd, const int64_t quantMode)
{
    // x must either be 2-D or use the dynamic-rank marker (-2,).
    const size_t xDimNum = xShape->GetDimNum();
    if (xDimNum == DIM_ONE) {
        if (xShape->GetDim(0) != ge::UNKNOWN_DIM_NUM) {
            OPS_LOG_E(context->GetNodeName(), "The dynamic dim of x should be -2, current shape is %s.",
                      ops::Shape2String(*xShape).c_str());
            return ge::GRAPH_FAILED;
        }
    } else if (xDimNum != DIM_TWO) {
        OPS_LOG_E(context->GetNodeName(), "The dim of x should be 2 or dynamic, current shape is %s.",
                  ops::Shape2String(*xShape).c_str());
        return ge::GRAPH_FAILED;
    }

    // In the dynamic-rank form both dims are treated as unknown (NEG_ONE).
    const int64_t xRowNum = (xDimNum == DIM_ONE) ? NEG_ONE : xShape->GetDim(0);
    const int64_t xColNum = (xDimNum == DIM_ONE) ? NEG_ONE : xShape->GetDim(1);
    if (xRowNum < NEG_ONE || xColNum < NEG_ONE) {
        OPS_LOG_E(context->GetNodeName(), "Invalid x shape, shape is %s.", ops::Shape2String(*xShape).c_str());
        return ge::GRAPH_FAILED;
    }

    // expert_idx follows the same rule: 2-D or the (-2,) dynamic-rank marker.
    const size_t idxDimNum = expertIdxShape->GetDimNum();
    if (idxDimNum == DIM_ONE) {
        if (expertIdxShape->GetDim(0) != ge::UNKNOWN_DIM_NUM) {
            OPS_LOG_E(context->GetNodeName(), "The dynamic dim of expert_idx should be -2, current shape is %s.",
                      ops::Shape2String(*expertIdxShape).c_str());
            return ge::GRAPH_FAILED;
        }
    } else if (idxDimNum != DIM_TWO) {
        OPS_LOG_E(context->GetNodeName(), "The dim of expert_idx should be 2 or dynamic, current shape is %s.",
                  ops::Shape2String(*expertIdxShape).c_str());
        return ge::GRAPH_FAILED;
    }

    const int64_t idxRowNum = (idxDimNum == DIM_ONE) ? NEG_ONE : expertIdxShape->GetDim(0);
    const int64_t idxColNum = (idxDimNum == DIM_ONE) ? NEG_ONE : expertIdxShape->GetDim(1);
    if (idxRowNum < NEG_ONE || idxColNum < NEG_ONE) {
        OPS_LOG_E(context->GetNodeName(), "Invalid expert_idx shape, shape is %s.",
                  ops::Shape2String(*expertIdxShape).c_str());
        return ge::GRAPH_FAILED;
    }

    // x and expert_idx describe the same batch of rows.
    if (!isSameDim(xRowNum, idxRowNum)) {
        OPS_LOG_E(context->GetNodeName(), "The first dim of x and expert_idx should be same.");
        return ge::GRAPH_FAILED;
    }

    // scale shape depends on quant_mode; delegated.
    if (CheckInputScaleShape(context, xShape, scaleShape, expertStart, expertEnd, quantMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // offset shape depends on quant_mode; delegated.
    if (CheckInputOffsetShape(context, offsetShape, expertStart, expertEnd, quantMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Debug helper: dumps the input shapes and the shape-relevant attributes before inference.
static void ShowInputShapeAndAttrInfo(gert::InferShapeContext *context, const gert::Shape *xShape,
                                      const gert::Shape *expertIdxShape, const gert::Shape *scaleShape,
                                      const gert::Shape *offsetShape, const int64_t expertStart,
                                      const int64_t expertEnd, const int64_t quantMode, const int64_t rowIdxType)
{
    // x and expert_idx are required inputs, so their shapes are always printable.
    OPS_LOG_D(context->GetNodeName(), "x shape is: %s.", ops::Shape2String(*xShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expert_idx shape is: %s.", ops::Shape2String(*expertIdxShape).c_str());

    // scale is optional; print a placeholder when it is absent.
    if (scaleShape != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "scale_shape is: %s.", ops::Shape2String(*scaleShape).c_str());
    } else {
        OPS_LOG_D(context->GetNodeName(), "scale_shape is: none.");
    }

    // offset is optional as well.
    OPS_LOG_D(context->GetNodeName(), "Begin print offset_shape.");
    if (offsetShape != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "offset_shape is: %s.", ops::Shape2String(*offsetShape).c_str());
    } else {
        OPS_LOG_D(context->GetNodeName(), "offset_shape is: none.");
    }
    OPS_LOG_D(context->GetNodeName(), "End print offset_shape.");

    // Attributes that drive the output-shape inference.
    OPS_LOG_D(context->GetNodeName(), "active_expert_range is: [%ld, %ld).", expertStart, expertEnd);
    OPS_LOG_D(context->GetNodeName(), "quant_mode is: %ld.", quantMode);
    OPS_LOG_D(context->GetNodeName(), "row_Idx_type is: %ld.", rowIdxType);
}
|
||||
|
||||
// Debug helper: prints the four inferred output shapes after InferShape has populated them.
// All four pointers must be non-null (the caller null-checks them via OP_CHECK_NULL_WITH_CONTEXT).
static void ShowOutputShapeInfo(gert::InferShapeContext *context, const gert::Shape *expandedXShape,
                                const gert::Shape *expandedRowIdxShape,
                                const gert::Shape *expertTokenCumsumOrCountShape, const gert::Shape *expandedScaleShape)
{
    OPS_LOG_D(context->GetNodeName(), "expanded_x shape is: %s after infershape.",
              ops::Shape2String(*expandedXShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expanded_row_idx shape is: %s after infershape.",
              ops::Shape2String(*expandedRowIdxShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expert_token_cumsum_or_count shape is: %s after infershape.",
              ops::Shape2String(*expertTokenCumsumOrCountShape).c_str());
    OPS_LOG_D(context->GetNodeName(), "expanded_scale shape is: %s after infershape.",
              ops::Shape2String(*expandedScaleShape).c_str());
}
|
||||
|
||||
// InferShape for MoeInitRoutingCustom.
// Validates the inputs (x, expert_idx, optional scale/offset) and all attributes, then
// derives the shapes of expanded_x, expanded_row_idx, expert_token_cumsum_or_count and
// expanded_scale. NEG_ONE marks a dynamic (unknown) dim throughout.
static ge::graphStatus InferShape4MoeInitRoutingCustom(gert::InferShapeContext *context)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do MoeInitRoutingCustomInfershape.");
    // 1. Get and check input shapes.
    // 1.1 input_x (required)
    const gert::Shape *xShape = context->GetInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_X);
    OP_CHECK_NULL_WITH_CONTEXT(context, xShape);

    // 1.2 expert_idx (required)
    const gert::Shape *expertIdxShape = context->GetInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_EXPERT_IDX);
    OP_CHECK_NULL_WITH_CONTEXT(context, expertIdxShape);

    // 1.3 scale is optional and can be none; no null check.
    const gert::Shape *scaleShape = context->GetOptionalInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_SCALE);

    // 1.4 offset is optional and can be none; no null check.
    const gert::Shape *offsetShape = context->GetOptionalInputShape(MOE_INIT_ROUTING_CUSTOM_INPUT_OFFSET);

    // 2. Get and check attrs. Null-checked once here; the redundant second
    // "if (nullptr == attrs)" that used to sit between steps 2.2 and 2.3 was removed.
    const gert::RuntimeAttrs *attrs = context->GetAttrs();
    OP_CHECK_NULL_WITH_CONTEXT(context, attrs);

    // 2.1 expert_num
    int64_t experNum = static_cast<int64_t>(-1);
    if (GetAndCheckAttrExpertNum(attrs, context, experNum) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.2 active_expert_range
    int64_t expertStart = static_cast<int64_t>(-1);
    int64_t expertEnd = static_cast<int64_t>(-1);
    if (GetAndCheckAttrActiveExpertRange(attrs, context, expertStart, expertEnd, experNum) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.3 drop_pad_mode
    int64_t dropPadMode = static_cast<int64_t>(-1);
    if (GetAndCheckAttrDropPadMode(attrs, context, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.4 active_num (range-checked only in NO_DROP_PAD mode)
    int64_t activeNum = static_cast<int64_t>(-1);
    if (GetAndCheckAttrActiveNum(attrs, context, activeNum, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.5 expert_capacity (range-checked only in DROP_PAD mode)
    int64_t expertCapacity = static_cast<int64_t>(-1);
    if (GetAndCheckAttrExpertCapacity(attrs, context, xShape, expertCapacity, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.6 expert_token_num_type
    int64_t expertTokenNumType = static_cast<int64_t>(-1);
    if (GetAndCheckAttrExpertTokenNumType(attrs, context, expertTokenNumType) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.7 expert_token_num_flag (comment fixed: it previously said "_type")
    bool expertTokenNumFlag = false;
    if (GetAndCheckAttrExpertTokenNumFlag(attrs, context, expertTokenNumFlag) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.8 quant_mode
    int64_t quantMode = static_cast<int64_t>(-1);
    if (GetAndCheckAttrQuantMode(attrs, context, quantMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 2.9 row_Idx_type
    int64_t rowIdxType = static_cast<int64_t>(-1);
    if (GetAndCheckAttrRowIdxType(attrs, context, rowIdxType, dropPadMode) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // Cross-check the input shapes against each other and the attrs.
    if (CheckInputShape(context, xShape, expertIdxShape, scaleShape, offsetShape, expertStart, expertEnd, quantMode) !=
        ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }

    // 3. Infer output shapes.
    // 3.1 Fetch the output shape objects.
    gert::Shape *expandedXShape = context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X);
    OP_CHECK_NULL_WITH_CONTEXT(context, expandedXShape);
    gert::Shape *expandedRowIdxShape = context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX);
    OP_CHECK_NULL_WITH_CONTEXT(context, expandedRowIdxShape);
    gert::Shape *expertTokenCumsumOrCountShape =
        context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT);
    OP_CHECK_NULL_WITH_CONTEXT(context, expertTokenCumsumOrCountShape);
    gert::Shape *expandedScaleShape = context->GetOutputShape(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE);
    OP_CHECK_NULL_WITH_CONTEXT(context, expandedScaleShape);

    // A 1-D input shape is the dynamic-rank marker (-2,), so every dim is unknown.
    int64_t x_n = xShape->GetDimNum() == DIM_ONE ? NEG_ONE : xShape->GetDim(0);
    int64_t cols = xShape->GetDimNum() == DIM_ONE ? NEG_ONE : xShape->GetDim(1);

    int64_t expert_idx_n = expertIdxShape->GetDimNum() == DIM_ONE ? NEG_ONE : expertIdxShape->GetDim(0);
    int64_t k = expertIdxShape->GetDimNum() == DIM_ONE ? NEG_ONE : expertIdxShape->GetDim(1);
    int64_t n = x_n > expert_idx_n ? x_n : expert_idx_n;
    // active_num of 0 or -1 means "keep all n*k routed rows".
    if (activeNum == 0 || activeNum == -1) {
        activeNum = n * k;
    } else {
        activeNum = std::min(activeNum, n * k);
    }

    int64_t xOutDimNum = activeNum < n * k ? activeNum : n * k;
    int64_t outNum = (n == NEG_ONE || k == NEG_ONE) ? NEG_ONE : n * k;
    int64_t xOutNum = (n == NEG_ONE || k == NEG_ONE) ? NEG_ONE : xOutDimNum;
    // 3.2 expanded_x: (active rows, cols) without drop/pad,
    // (expert_num, expert_capacity, cols) in drop/pad mode.
    if (dropPadMode == DropPadMode::NO_DROP_PAD) {
        expandedXShape->SetDimNum(DIM_TWO);
        expandedXShape->SetDim(0U, xOutNum);
        expandedXShape->SetDim(DIM_ONE, cols);
    } else {
        expandedXShape->SetDimNum(DIM_THREE);
        expandedXShape->SetDim(0U, experNum);
        expandedXShape->SetDim(DIM_ONE, expertCapacity);
        expandedXShape->SetDim(DIM_TWO, cols);
    }

    // 3.3 expanded_row_idx: one entry per routed row, (n*k,).
    expandedRowIdxShape->SetDimNum(DIM_ONE);
    expandedRowIdxShape->SetDim(0U, outNum);

    // 3.4 expert_token_cumsum_or_count: (expert_num, 2) in KEY_VALUE mode, otherwise the
    // size of the active expert range. Left untouched when expert_token_num_flag is false.
    if (expertTokenNumFlag) {
        if (expertTokenNumType == ExpertTokenNumType::KEY_VALUE) {
            expertTokenCumsumOrCountShape->SetDimNum(DIM_TWO);
            expertTokenCumsumOrCountShape->SetDim(0U, experNum);
            expertTokenCumsumOrCountShape->SetDim(DIM_ONE, KEY_VALUE_MODE_DIM0_NUM);
        } else {
            expertTokenCumsumOrCountShape->SetDimNum(DIM_ONE);
            expertTokenCumsumOrCountShape->SetDim(0U, expertEnd - expertStart);
        }
    }

    // 3.5 expanded_scale: only populated in NON_QUANT and DYNAMIC_QUANT modes.
    if (QuantMode::NON_QUANT == quantMode || QuantMode::DYNAMIC_QUANT == quantMode) {
        expandedScaleShape->SetDimNum(DIM_ONE);
        if (dropPadMode == DropPadMode::NO_DROP_PAD) {
            expandedScaleShape->SetDim(0U, xOutNum);
        } else {
            expandedScaleShape->SetDim(0U, experNum * expertCapacity);
        }
    }

    ShowOutputShapeInfo(context, expandedXShape, expandedRowIdxShape, expertTokenCumsumOrCountShape,
                        expandedScaleShape);
    OPS_LOG_D(context->GetNodeName(), "End to do MoeInitRoutingCustomInfershape.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// InferDataType for MoeInitRoutingCustom.
// expanded_x keeps x's dtype in NON_QUANT mode and becomes int8 in either quant mode;
// the other outputs have fixed dtypes (int32 / int64 / float).
static ge::graphStatus InferDataType4MoeInitRoutingCustom(gert::InferDataTypeContext *context)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do MoeInitRoutingCustomInferDataType.");

    // Get and check quant_mode attr.
    const gert::RuntimeAttrs *attrs = context->GetAttrs();
    OP_CHECK_NULL_WITH_CONTEXT(context, attrs);
    const int64_t *quantModePtr = attrs->GetAttrPointer<int64_t>(MOE_INIT_ROUTING_CUSTOM_ATTR_QUANT_MODE);
    if (nullptr == quantModePtr) {
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is none.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT);
        return ge::GRAPH_FAILED;
    }
    const int64_t quantMode = *quantModePtr;

    // Infer the expanded_x dtype according to quant_mode.
    auto xDtype = context->GetInputDataType(MOE_INIT_ROUTING_CUSTOM_INPUT_X);
    if (QuantMode::NON_QUANT == quantMode) {
        context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X, xDtype);
    } else if (QuantMode::STATIC_QUANT == quantMode || QuantMode::DYNAMIC_QUANT == quantMode) {
        if (ge::DT_INT8 == xDtype) {
            // Fixed message typo: "int_8" -> "int8".
            OPS_LOG_E(context->GetNodeName(), "When quant_mode=%ld, xDtype cannot be int8.", quantMode);
            return ge::GRAPH_FAILED;
        }
        context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X, ge::DT_INT8);
    } else {
        // Bug fix: an out-of-range quant_mode previously fell through silently, leaving
        // the expanded_x dtype unset. Mirror the validation done in InferShape.
        OPS_LOG_E(context->GetNodeName(), "The quant_mode should be %d, %d or %d. But it is %ld.", QuantMode::NON_QUANT,
                  QuantMode::STATIC_QUANT, QuantMode::DYNAMIC_QUANT, quantMode);
        return ge::GRAPH_FAILED;
    }
    context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX, ge::DT_INT32);
    context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT, ge::DT_INT64);
    context->SetOutputDataType(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE, ge::DT_FLOAT);
    OPS_LOG_D(context->GetNodeName(), "End to do MoeInitRoutingCustomInferDataType.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// InferShapeRange for MoeInitRoutingCustom.
// Every output is reported as rank-fixed but fully dynamic: min dim 0, max dim -1
// (unbounded). expanded_x is 2-D; the remaining three outputs are 1-D.
// Bug fix: the original dereferenced range->GetMin()/GetMax() in the "Before" debug logs
// without the null checks that guarded the dim-setting code below; all logging now lives
// inside the same null-guarded section as the dim updates.
static ge::graphStatus InferShapeRange4MoeInitRoutingCustom(gert::InferShapeRangeContext *context)
{
    OPS_LOG_D(context->GetNodeName(), "Begin to do MoeInitRoutingCustomInferRange.");

    // Get and check the pointers of all the outputs' shape range objects.
    auto expanded_x = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_X);
    OP_CHECK_NULL_WITH_CONTEXT(context, expanded_x);
    auto expanded_row_idx = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_ROW_IDX);
    OP_CHECK_NULL_WITH_CONTEXT(context, expanded_row_idx);
    auto count = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPERT_TOKEN_CUMSUM_OR_COUNT);
    OP_CHECK_NULL_WITH_CONTEXT(context, count);
    auto expanded_scale = context->GetOutputShapeRange(MOE_INIT_ROUTING_CUSTOM_OUTPUT_EXPANDED_SCALE);
    OP_CHECK_NULL_WITH_CONTEXT(context, expanded_scale);

    // expanded_x: 2-D, each dim in [0, unbounded).
    if (expanded_x->GetMin() != nullptr && expanded_x->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_x->GetMin() = %s",
                  ops::Shape2String(*(expanded_x->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_x->GetMax() = %s",
                  ops::Shape2String(*(expanded_x->GetMax())).c_str());
        expanded_x->GetMin()->SetDimNum(DIM_TWO);
        expanded_x->GetMax()->SetDimNum(DIM_TWO);
        for (size_t i = 0; i < DIM_TWO; i++) {
            expanded_x->GetMin()->SetDim(i, 0);
            expanded_x->GetMax()->SetDim(i, -1);
        }
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_x->GetMin() = %s",
                  ops::Shape2String(*(expanded_x->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_x->GetMax() = %s",
                  ops::Shape2String(*(expanded_x->GetMax())).c_str());
    }

    // expanded_row_idx: 1-D, [0, unbounded).
    if (expanded_row_idx->GetMin() != nullptr && expanded_row_idx->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_row_idx->GetMin() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_row_idx->GetMax() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMax())).c_str());
        expanded_row_idx->GetMin()->SetDimNum(DIM_ONE);
        expanded_row_idx->GetMax()->SetDimNum(DIM_ONE);
        expanded_row_idx->GetMin()->SetDim(0, 0);
        expanded_row_idx->GetMax()->SetDim(0, -1);
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_row_idx->GetMin() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_row_idx->GetMax() = %s",
                  ops::Shape2String(*(expanded_row_idx->GetMax())).c_str());
    }

    // expert_token_cumsum_or_count: 1-D, [0, unbounded).
    if (count->GetMin() != nullptr && count->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, count->GetMin() = %s",
                  ops::Shape2String(*(count->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, count->GetMax() = %s",
                  ops::Shape2String(*(count->GetMax())).c_str());
        count->GetMin()->SetDimNum(DIM_ONE);
        count->GetMax()->SetDimNum(DIM_ONE);
        count->GetMin()->SetDim(0, 0);
        count->GetMax()->SetDim(0, -1);
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, count->GetMin() = %s",
                  ops::Shape2String(*(count->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, count->GetMax() = %s",
                  ops::Shape2String(*(count->GetMax())).c_str());
    }

    // expanded_scale: 1-D, [0, unbounded).
    if (expanded_scale->GetMin() != nullptr && expanded_scale->GetMax() != nullptr) {
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_scale->GetMin() = %s",
                  ops::Shape2String(*(expanded_scale->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "Before InferShapeRange, expanded_scale->GetMax() = %s",
                  ops::Shape2String(*(expanded_scale->GetMax())).c_str());
        expanded_scale->GetMin()->SetDimNum(DIM_ONE);
        expanded_scale->GetMax()->SetDimNum(DIM_ONE);
        expanded_scale->GetMin()->SetDim(0, 0);
        expanded_scale->GetMax()->SetDim(0, -1);
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_scale->GetMin() = %s",
                  ops::Shape2String(*(expanded_scale->GetMin())).c_str());
        OPS_LOG_D(context->GetNodeName(), "After InferShapeRange, expanded_scale->GetMax() = %s",
                  ops::Shape2String(*(expanded_scale->GetMax())).c_str());
    }

    OPS_LOG_D(context->GetNodeName(), "End to do MoeInitRoutingCustomInferRange.");
    return ge::GRAPH_SUCCESS;
}
|
||||
|
||||
// Register the shape/dtype/shape-range inference callbacks for the
// MoeInitRoutingCustom op so the GE runtime can derive output tensor
// shapes, data types and shape ranges at graph-compile time.
IMPL_OP_INFERSHAPE(MoeInitRoutingCustom)
    .InferShape(InferShape4MoeInitRoutingCustom)
    .InferDataType(InferDataType4MoeInitRoutingCustom)
    .InferShapeRange(InferShapeRange4MoeInitRoutingCustom);
}  // namespace ops
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,143 @@
|
||||
/**
 * This program is free software, you can redistribute it and/or modify.
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_init_routing_custom_tiling.h
 * \brief Tiling-data structures and compile-time info for the MoeInitRoutingCustom op.
 *        Each sub-stage of the kernel (sort, merge, token counting, gather, scatter)
 *        has its own per-core work-split descriptor, aggregated in
 *        MoeInitRoutingCustomTilingData.
 */
#ifndef AIR_CXX_RUNTIME_V2_OP_IMPL_MOE_INIT_ROUTING_CUSTOM_H
#define AIR_CXX_RUNTIME_V2_OP_IMPL_MOE_INIT_ROUTING_CUSTOM_H
#include "register/tilingdata_base.h"
#include "tiling/tiling_api.h"

namespace optiling {
// Per-core split for the VBS (vector block sort) stage: element/loop counts
// for the regular cores and the (possibly smaller) last core.
BEGIN_TILING_DATA_DEF(MoeCustomVBSComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, oneLoopMaxElements);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomVBSComputeTilingDataOp, MoeCustomVBSComputeTilingData)

// VMS (merge) middle stage only needs the number of participating cores.
BEGIN_TILING_DATA_DEF(MoeCustomVMSMiddleComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomVMSMiddleComputeTilingDataOp, MoeCustomVMSMiddleComputeTilingData)

// Final sort-output stage: maximum elements processed per loop iteration.
BEGIN_TILING_DATA_DEF(MoeCustomSortOutComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, oneLoopMaxElements);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomSortOutComputeTilingDataOp, MoeCustomSortOutComputeTilingData)

// Per-core split for the expert-token counting (histogram) stage.
BEGIN_TILING_DATA_DEF(MoeCustomExpertTokensCountTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopElements);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomExpertTokensCountTilingDataOp, MoeCustomExpertTokensCountTilingData)

// Per-core split for the gather-output stage; rows are split across cores and
// each row's columns are additionally tiled (colsLoops / perLoopCols / lastLoopCols).
BEGIN_TILING_DATA_DEF(MoeCustomGatherOutComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreIndicesLoops);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreIndicesLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopIndicesElements);
TILING_DATA_FIELD_DEF(int64_t, colsLoops);
TILING_DATA_FIELD_DEF(int64_t, perLoopCols);
TILING_DATA_FIELD_DEF(int64_t, lastLoopCols);
TILING_DATA_FIELD_DEF(int64_t, activeNum);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomGatherOutComputeTilingDataOp, MoeCustomGatherOutComputeTilingData)

// Per-core split for the src-to-dst scatter stage in drop/pad (capacity) mode.
BEGIN_TILING_DATA_DEF(MoeCustomSrcToDstCapacityComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreRows);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopRows);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopRows);
TILING_DATA_FIELD_DEF(int64_t, lastCoreRows);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopRows);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopRows);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, perLoopCols);
TILING_DATA_FIELD_DEF(int64_t, lastLoopCols);
TILING_DATA_FIELD_DEF(int64_t, colLoops);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomSrcToDstCapacityComputeTilingDataOp, MoeCustomSrcToDstCapacityComputeTilingData)

// Per-core split for the src-to-dst index-mapping stage (dropless mode).
BEGIN_TILING_DATA_DEF(MoeCustomSrcToDstComputeTilingData)
TILING_DATA_FIELD_DEF(int64_t, needCoreNum);
TILING_DATA_FIELD_DEF(int64_t, perCoreElements);
TILING_DATA_FIELD_DEF(int64_t, perCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreElements);
TILING_DATA_FIELD_DEF(int64_t, lastCorePerLoopElements);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLastLoopElements);
TILING_DATA_FIELD_DEF(int64_t, perCoreLoops);
TILING_DATA_FIELD_DEF(int64_t, lastCoreLoops);  // fixed: trailing ';' was missing, unlike every other field
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeCustomSrcToDstComputeTilingDataOp, MoeCustomSrcToDstComputeTilingData)

// Top-level tiling data: op-wide scalars (shapes, expert range, modes) plus one
// embedded per-stage descriptor for each kernel phase.
BEGIN_TILING_DATA_DEF(MoeInitRoutingCustomTilingData)
TILING_DATA_FIELD_DEF(int64_t, coreNum);
TILING_DATA_FIELD_DEF(int64_t, n);
TILING_DATA_FIELD_DEF(int64_t, cols);
TILING_DATA_FIELD_DEF(int64_t, k);
TILING_DATA_FIELD_DEF(int64_t, expertStart);
TILING_DATA_FIELD_DEF(int64_t, expertEnd);
TILING_DATA_FIELD_DEF(int64_t, actualExpertNum);
TILING_DATA_FIELD_DEF(int64_t, quantMode);
TILING_DATA_FIELD_DEF(int64_t, rowIdxType);
TILING_DATA_FIELD_DEF(int64_t, isInputScale);
TILING_DATA_FIELD_DEF(int64_t, isInputOffset);
TILING_DATA_FIELD_DEF(int64_t, expertNum);
TILING_DATA_FIELD_DEF(int64_t, expertTokensNumType);
TILING_DATA_FIELD_DEF(int64_t, expertTokensNumFlag);
TILING_DATA_FIELD_DEF(int64_t, gatherFirstFullload);
TILING_DATA_FIELD_DEF(int64_t, ep);
TILING_DATA_FIELD_DEF(int64_t, activeNum);
TILING_DATA_FIELD_DEF(int64_t, dropPadMode);
TILING_DATA_FIELD_DEF(int64_t, smoothType);
TILING_DATA_FIELD_DEF(int64_t, expertCountElements);
TILING_DATA_FIELD_DEF(int64_t, expertCapacity);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomVBSComputeTilingData, vbsComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomVMSMiddleComputeTilingData, vmsMiddleComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSortOutComputeTilingData, sortOutComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomExpertTokensCountTilingData, expertTokensCountTilingDataOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomGatherOutComputeTilingData, gatherOutComputeParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSrcToDstCapacityComputeTilingData, srcToDstDropPadParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSrcToDstCapacityComputeTilingData, srcToDstDropPadDynamicParamsOp);
TILING_DATA_FIELD_DEF_STRUCT(MoeCustomSrcToDstComputeTilingData, srcToDstComputeParamsOp);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(MoeInitRoutingCustom, MoeInitRoutingCustomTilingData)

// Platform facts cached at compile (TilingParse) time and reused for every tiling call.
struct MoeInitRoutingCustomCompileInfo {
    int32_t aivNum = 0;    // number of AI-vector cores
    uint64_t ubSize = 0;   // unified-buffer size in bytes
    platform_ascendc::SocVersion socVersion = platform_ascendc::SocVersion::ASCEND910B;
};
}  // namespace optiling
#endif
|
||||
@@ -0,0 +1,68 @@
|
||||
/**
 * This program is free software, you can redistribute it and/or modify.
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_init_routing_custom_tiling_base.cpp
 * \brief Tiling entry points for MoeInitRoutingCustom: compile-time platform-info
 *        caching (TilingParse) and runtime dispatch to the registered tiling template.
 */
#include "moe_init_routing_custom_tiling.h"
#include "register/op_def_registry.h"
#include "tiling/tiling_templates_registry.h"

// Branch-prediction hint: the wrapped condition is expected to be false.
#define unlikely(x) __builtin_expect((x), 0)

// Log an error and fail tiling with GRAPH_FAILED when `ptr` is nullptr.
// Falls back to "nil" as the node name when the context itself is unusable.
#define OP_CHECK_NULL_WITH_CONTEXT(context, ptr)                                                           \
    do {                                                                                                   \
        if (unlikely((ptr) == nullptr)) {                                                                  \
            const char* name = (unlikely(((context) == nullptr) || (context)->GetNodeName() == nullptr)) ? \
                "nil" :                                                                                    \
                (context)->GetNodeName();                                                                  \
            OPS_LOG_E(name, "%s is nullptr!", #ptr);                                                       \
            return ge::GRAPH_FAILED;                                                                       \
        }                                                                                                  \
    } while (0)

namespace optiling {
// Runtime tiling entry: delegate to whichever tiling template is registered
// for this op in the global tiling registry.
static ge::graphStatus TilingForMoeInitRoutingCustom(gert::TilingContext *context)
{
    return TilingRegistry::GetInstance().DoTilingImpl(context);
}

// Compile-time preparation: cache AIV core count, UB size and SoC version into
// the op's compile info so per-invocation tiling does not re-query the platform.
// (Renamed from "Rounting" to fix the typo; the function is file-local.)
static ge::graphStatus TilingPrepareForMoeInitRoutingCustom(gert::TilingParseContext* context)
{
    OPS_LOG_D(context, "TilingPrepareForMoeInitRoutingCustom enter.");

    auto compileInfo = context->GetCompiledInfo<MoeInitRoutingCustomCompileInfo>();
    OP_CHECK_NULL_WITH_CONTEXT(context, compileInfo);
    auto platformInfo = context->GetPlatformInfo();
    OP_CHECK_NULL_WITH_CONTEXT(context, platformInfo);
    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
    compileInfo->aivNum = ascendcPlatform.GetCoreNumAiv();
    if (compileInfo->aivNum <= 0) {
        OPS_LOG_E(context, "TilingPrepareForMoeInitRoutingCustom fail to get core num.");
        return ge::GRAPH_FAILED;
    }

    // Initialize before the query: if GetCoreMemSize does not set the output,
    // the zero value is caught by the check below instead of reading garbage.
    uint64_t ubSize = 0;
    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize);
    compileInfo->ubSize = ubSize;  // field is uint64_t; the old int64_t cast was a no-op round-trip
    compileInfo->socVersion = ascendcPlatform.GetSocVersion();
    if (compileInfo->ubSize == 0) {  // unsigned, so "<= 0" could only ever mean "== 0"
        OPS_LOG_E(context, "TilingPrepareForMoeInitRoutingCustom fail to get ub size.");
        return ge::GRAPH_FAILED;
    }

    return ge::GRAPH_SUCCESS;
}

IMPL_OP_OPTILING(MoeInitRoutingCustom)
    .Tiling(TilingForMoeInitRoutingCustom)
    .TilingParse<MoeInitRoutingCustomCompileInfo>(TilingPrepareForMoeInitRoutingCustom);
}  // namespace optiling
|
||||
110
csrc/moe_init_routing_custom/op_kernel/moe_custom_common.h
Normal file
110
csrc/moe_init_routing_custom/op_kernel/moe_custom_common.h
Normal file
@@ -0,0 +1,110 @@
|
||||
/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_custom_common.h
 * \brief Shared constants and small device-side helpers for the
 *        MoeInitRoutingCustom kernel files.
 */
#ifndef MOE_CUSTOM_COMMON_H
#define MOE_CUSTOM_COMMON_H

#include "kernel_operator.h"

namespace MoeInitRoutingCustom {
using namespace AscendC;
// Split-axis selectors.
constexpr int64_t SPLIT_N = 0;
constexpr int64_t SPLIT_K = 1;
// Sentinel: approximately -FLT_MAX, used as a minimum fp32 value.
constexpr float MIN_FP32 = -3.4e38f;
// Vector-instruction granularities (elements per repeat / per 32-byte block).
constexpr int64_t FP32_ONE_REPEAT_NUM = 64;
constexpr int64_t ONE_REPEAT_SORT_NUM = 32;
constexpr int64_t ONE_REPEAT_COMPARE_NUM = 64;
constexpr int64_t BLOCK_BYTES = 32;
constexpr int64_t INT32_ONE_BLOCK_NUM = 8;
constexpr int64_t FP32_ONE_BLOCK_NUM = 8;
// drop_pad_mode attribute values.
constexpr int64_t DROPLESS_MODE = 0;
constexpr int64_t DROP_PAD_MODE = 1;
// Size of the `assist` index table below.
constexpr int64_t ASSIST_NUM = 256;
constexpr int64_t ASSIST_INDEX_NUM = 32;
// Maximum elements per list accepted by MrgSort.
constexpr int64_t MRGSORT_LIST_MAX_ELEMENT = 2040;
constexpr float MAX_INT8 = 127.0f;
// NOTE(review): 0xFF7FFFFF is the bit pattern of -FLT_MAX, not -inf;
// the name "INF" may be misleading — confirm intended use at call sites.
constexpr uint32_t INF = 0xFF7FFFFF;

// Merge-sort list counts and list indices.
constexpr int64_t MERGE_LIST_TWO = 2;
constexpr int64_t MERGE_LIST_THREE = 3;
constexpr int64_t MERGE_LIST_FOUR = 4;

constexpr int64_t MERGE_LIST_IDX_TWO = 2;
constexpr int64_t MERGE_LIST_IDX_THREE = 3;

// row_idx_type attribute values.
constexpr int64_t GATHER = 0;
constexpr int64_t SCATTER = 1;

// Smooth-scale variants (none / 1xH / ExH).
static constexpr int64_t NO_SCALE = 0;
static constexpr int64_t SCALE_1H = 1;
static constexpr int64_t SCALE_EH = 2;

// expert_tokens_num_type attribute values ("EXERPT" typo kept: the
// identifiers are referenced by other kernel files).
constexpr int64_t EXERPT_TOKENS_CUMSUM = 0;
constexpr int64_t EXERPT_TOKENS_COUNT = 1;
constexpr int64_t EXERPT_TOKENS_KEY_VALUE = 2;
constexpr int64_t EXERPT_TOKENS_NONE = 0;

// Index table in GM: value k at every 8th int32 slot (k = 0..31), zeros
// elsewhere — presumably consumed as an ascending-index seed; confirm at use.
const __gm__ int32_t assist[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,
    4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0,
    8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0,
    12, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0,
    16, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0,
    20, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0,
    24, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 0, 0, 0, 0, 0, 26, 0, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0,
    28, 0, 0, 0, 0, 0, 0, 0, 29, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0};

// Ceiling division; returns 0 when b == 0 (guards a divide-by-zero).
__aicore__ inline int64_t Ceil(int64_t a, int64_t b)
{
    if (b == 0) {
        return 0;
    }
    return (a + b - 1) / b;
}

// Round `elementNum` elements of size `bytes` up to a whole number of
// 32-byte blocks, returned as an element count. Returns 0 when bytes == 0.
__aicore__ inline int64_t Align(int64_t elementNum, int64_t bytes)
{
    if (bytes == 0) {
        return 0;
    }
    return (elementNum * bytes + BLOCK_BYTES - 1) / BLOCK_BYTES * BLOCK_BYTES / bytes;
}

// Round `elementNum` elements of size `bytes` up to a whole number of
// 32-byte blocks, returned as a byte count.
__aicore__ inline int64_t AlignBytes(int64_t elementNum, int64_t bytes)
{
    return (elementNum * bytes + BLOCK_BYTES - 1) / BLOCK_BYTES * BLOCK_BYTES;
}

// Minimum of two comparable values.
template <typename T>
__aicore__ inline T Min(T a, T b)
{
    return a > b ? b : a;
}

// Maximum of two comparable values.
template <typename T>
__aicore__ inline T Max(T a, T b)
{
    return a < b ? b : a;
}

// Set-then-wait on a hardware event to order two pipeline stages.
// NOTE(review): the event is passed both as the template argument and as the
// runtime `evt` (used only for FetchEventID); all call sites pass the same
// value twice — a mismatch would desynchronize the flag; confirm intent.
template <HardEvent event>
__aicore__ inline void SetWaitFlag(HardEvent evt)
{
    event_t eventId = static_cast<event_t>(GetTPipePtr()->FetchEventID(evt));
    SetFlag<event>(eventId);
    WaitFlag<event>(eventId);
}

}  // namespace MoeInitRoutingCustom
#endif  // MOE_CUSTOM_COMMON_H
|
||||
@@ -0,0 +1,371 @@
|
||||
/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file moe_custom_expert_tokens_count.h
 * \brief Kernel stage that counts tokens routed to each expert (histogram /
 *        cumsum / key-value output modes, selected by HISTOGRAMTYPE).
 */
#ifndef MOE_CUSTOM_EXPERT_TOKENS_COUNT_H
#define MOE_CUSTOM_EXPERT_TOKENS_COUNT_H

#include "moe_custom_common.h"
#include "kernel_operator.h"

namespace MoeInitRoutingCustom {
using namespace AscendC;

// One (expertId, count) pair per core in drop/pad mode.
constexpr int64_t EXPERT_ID_VALUE_NUM = 2;
// HISTOGRAMTYPE values: running sum, plain count, or (id, count) pairs.
constexpr int64_t CUMSUM_MODE = 0;
constexpr int64_t COUNT_MODE = 1;
constexpr int64_t KEY_VALUE_MODE = 2;
constexpr int64_t KEY_VALUE_MODE_DIM_NUM = 2;
constexpr int64_t GATHER_SORT_CORE_NUM = 16;
// drop_pad_mode values (mirror DROPLESS_MODE / DROP_PAD_MODE in common.h).
constexpr int64_t DROP_LESS = 0;
constexpr int64_t DROP_PAD = 1;

// Counts, per expert in [expertStart, expertEnd), how many tokens were routed
// to it, reading the sorted expert-id list from workspace and atomically
// accumulating per-core partial results into a workspace histogram.
template <const int HISTOGRAMTYPE>
class ExpertTokensCount {
public:
    __aicore__ inline ExpertTokensCount(){};
    // Bind GM buffers and derive this core's slice of the sorted id list.
    // CALC_ACTUAL_EXPERT_NUM: re-derive the core split from the actual sorted
    // element count written to workspace by the sort stage.
    template <bool CALC_ACTUAL_EXPERT_NUM>
    __aicore__ inline void Init(GM_ADDR expandedRowIdx, GM_ADDR expertTokensCount, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    // Run the per-loop copy-in / histogram / copy-out pipeline.
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn(int64_t loop, int64_t curLoopElements);
    __aicore__ inline void Compute(int64_t curLoopElements);
    __aicore__ inline void CopyOut();
    __aicore__ inline void CopyOutExpertTotalCount();

    // Final pass (core 0 only): workspace histogram -> int64 output tensor.
    __aicore__ inline void expertCountCopyIn();
    __aicore__ inline void expertCountCompute();
    __aicore__ inline void expertCountCopyOut();

private:
    GlobalTensor<int32_t> sortedexpertIdxGm_;    // sorted expert ids (workspace)
    GlobalTensor<int32_t> expertCountTempGm_;    // int32 histogram accumulator (workspace)
    GlobalTensor<int64_t> expertTokensCountGm_;  // op output: per-expert token counts
    GlobalTensor<int32_t> expertTotalCountGm_;   // scalar total token count (workspace)
    GlobalTensor<int32_t> expandedRowIdxGm_;     // op output: row-index mapping
    GlobalTensor<int32_t> expertIdxValueGm_;     // per-core trailing (id, count) pairs (workspace)
    TPipe *pipe_;

    TQue<QuePosition::VECIN, 1> sortedExpertIdxInQueue_;
    TQue<QuePosition::VECOUT, 1> expertCountOutToTempQueue_;
    TQue<QuePosition::VECIN, 1> expertCountTempInQueue_;
    TQue<QuePosition::VECOUT, 1> expertIdxCountOutQueue_;
    TQue<QuePosition::VECOUT, 1> expertTotalCountQueue_;

    const MoeCustomExpertTokensCountTilingData *expertTokensCountTilingData_;
    int64_t coreNum_;
    int64_t blockIdx_;
    int64_t needCoreNum_;
    int64_t perCoreElements_;
    int64_t curCoreElements_ = 0;
    int64_t expertStart_ = 0;       // first expert id handled by this rank
    int64_t expertEnd_ = 0;         // one past the last expert id handled
    int64_t actualExpertNum_ = 0;
    int64_t coreLoopsNum_ = 0;
    int64_t perCorePerLoopElements_ = 0;
    int64_t perCoreLastLoopElements_ = 0;
    int64_t actualExpertTotalNum_ = 0;  // tokens this core counted in-range
    int64_t expertNum_ = 0;
    int64_t expertCountElements_ = 0;   // element count of the output tensor
    bool expertTokensNumFlag_ = false;  // whether the counts output is requested
    int64_t dropPadMode_ = 0;
    int32_t finalExpertId = -1;     // last in-range expert seen by Compute()
    int32_t expertTokenValue = 0;   // its (possibly partial) trailing count
    int64_t ep_ = 0;
    int64_t rowIdxType_ = 0;        // GATHER or SCATTER
};
|
||||
|
||||
// Bind all GM buffers, pick this core's slice of the sorted expert-id list,
// and allocate the UB queues. Must be called by every core; ends with SyncAll.
template <const int HISTOGRAMTYPE>
template <bool CALC_ACTUAL_EXPERT_NUM>
__aicore__ inline void
ExpertTokensCount<HISTOGRAMTYPE>::Init(GM_ADDR expandedRowIdx, GM_ADDR expertTokensCount, GM_ADDR workspace,
                                       const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Cache tiling scalars.
    coreNum_ = tilingData->coreNum;
    pipe_ = tPipe;
    expertTokensCountTilingData_ = &(tilingData->expertTokensCountTilingDataOp);
    blockIdx_ = GetBlockIdx();
    needCoreNum_ = expertTokensCountTilingData_->needCoreNum;
    perCoreElements_ = expertTokensCountTilingData_->perCoreElements;
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    actualExpertNum_ = tilingData->actualExpertNum;
    expertNum_ = tilingData->expertNum;
    expertTokensNumFlag_ = tilingData->expertTokensNumFlag;
    dropPadMode_ = tilingData->dropPadMode;
    ep_ = tilingData->ep;
    rowIdxType_ = tilingData->rowIdxType;

    // Static split from tiling: last core may carry a different remainder.
    if (blockIdx_ == needCoreNum_ - 1) {
        curCoreElements_ = expertTokensCountTilingData_->lastCoreElements;
        coreLoopsNum_ = expertTokensCountTilingData_->lastCoreLoops;
        perCorePerLoopElements_ = expertTokensCountTilingData_->lastCorePerLoopElements;
        perCoreLastLoopElements_ = expertTokensCountTilingData_->lastCoreLastLoopElements;
    } else {
        curCoreElements_ = expertTokensCountTilingData_->perCoreElements;
        coreLoopsNum_ = expertTokensCountTilingData_->perCoreLoops;
        perCorePerLoopElements_ = expertTokensCountTilingData_->perCorePerLoopElements;
        perCoreLastLoopElements_ = expertTokensCountTilingData_->perCoreLastLoopElements;
    }

    if (CALC_ACTUAL_EXPERT_NUM) {
        // Dynamic split: the sort stage wrote per-core sorted-element counts to
        // workspace (after the n*k key/value region); re-derive the core split
        // from the actual total instead of the static tiling numbers.
        // key and value
        int64_t kvFactor = 2;
        GlobalTensor<int32_t> sortedNumGm;
        sortedNumGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                    Align(tilingData->n * tilingData->k, sizeof(int32_t)) * kvFactor * kvFactor);
        int32_t totalSortedNum = 0;
        // 16 entries — presumably GATHER_SORT_CORE_NUM partial counts; confirm
        // against the sort stage's writer.
        for (int32_t i = 0; i < 16; i++) {
            totalSortedNum += sortedNumGm.GetValue(i);
        }
        perCoreElements_ = Ceil(totalSortedNum, GetBlockNum());
        needCoreNum_ = Ceil(totalSortedNum, perCoreElements_);
        int64_t lastCoreElements = totalSortedNum - (needCoreNum_ - 1) * perCoreElements_;
        if (blockIdx_ == needCoreNum_ - 1) {
            curCoreElements_ = lastCoreElements;
        } else {
            curCoreElements_ = perCoreElements_;
        }
        coreLoopsNum_ = Ceil(curCoreElements_, expertTokensCountTilingData_->perCorePerLoopElements);
        perCorePerLoopElements_ = Ceil(curCoreElements_, coreLoopsNum_);
        perCoreLastLoopElements_ = curCoreElements_ - (coreLoopsNum_ - 1) * perCorePerLoopElements_;
    }

    // Output element count: key/value mode stores (id, count) pairs, capped at
    // expertNum pairs; other modes store one count per local expert.
    if constexpr (HISTOGRAMTYPE == KEY_VALUE_MODE) {
        expertCountElements_ = ((actualExpertNum_ + 1) < expertNum_) ? (actualExpertNum_ + 1) * KEY_VALUE_MODE_DIM_NUM :
                                                                      expertNum_ * KEY_VALUE_MODE_DIM_NUM;
    } else {
        expertCountElements_ = actualExpertNum_;
    }
    // Workspace layout (int32 units): [sorted ids | kv region | histogram |
    // total count | per-core (id, count) pairs]. Offsets must match the
    // producers of each region — verify against the other stages.
    sortedexpertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + blockIdx_ * perCoreElements_, curCoreElements_);
    expertTokensCountGm_.SetGlobalBuffer((__gm__ int64_t *)expertTokensCount, expertCountElements_);
    expertCountTempGm_.SetGlobalBuffer(
        (__gm__ int32_t *)workspace + Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2, actualExpertNum_);
    expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                            Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2 +
                                            Align(actualExpertNum_, sizeof(int32_t)),
                                        actualExpertNum_);
    expertIdxValueGm_.SetGlobalBuffer(
        (__gm__ int32_t *)workspace + Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2 +
            Align((actualExpertNum_), sizeof(int32_t)) + Align((actualExpertNum_), sizeof(int32_t)),
        coreNum_ * 2);
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreElements_,
                                      curCoreElements_);

    // In gather mode, pre-fill this core's output-index slice with -1
    // ("token not routed"); fence MTE3 writes before subsequent MTE2 reads.
    if ((tilingData->rowIdxType == GATHER) && (blockIdx_ < needCoreNum_)) {
        InitGlobalMemory(expandedRowIdxGm_, curCoreElements_, -1);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    // UB buffers sized for the larger of the two per-loop tile sizes.
    int64_t sortedExpertIdxInLen = Max(perCorePerLoopElements_, perCoreLastLoopElements_);

    pipe_->InitBuffer(sortedExpertIdxInQueue_, 1, AlignBytes(sortedExpertIdxInLen, sizeof(int32_t)));
    pipe_->InitBuffer(expertCountOutToTempQueue_, 1, AlignBytes(actualExpertNum_, sizeof(int32_t)));
    pipe_->InitBuffer(expertCountTempInQueue_, 1, AlignBytes(actualExpertNum_, sizeof(int32_t)));

    pipe_->InitBuffer(expertIdxCountOutQueue_, 1, AlignBytes(expertCountElements_, sizeof(int64_t)));
    pipe_->InitBuffer(expertTotalCountQueue_, 1, AlignBytes(1, sizeof(int32_t)));

    // Core 0 zeroes the shared total counter before the atomic adds start.
    if (blockIdx_ == 0) {
        InitGlobalMemory(expertTotalCountGm_, 1, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    SyncAll();
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::Process()
|
||||
{
|
||||
if (blockIdx_ < needCoreNum_) {
|
||||
for (int64_t i = 0; i < coreLoopsNum_; i++) {
|
||||
int64_t perLoopElements = (i == (coreLoopsNum_ - 1)) ? perCoreLastLoopElements_ : perCorePerLoopElements_;
|
||||
CopyIn(i, perLoopElements);
|
||||
Compute(perLoopElements);
|
||||
CopyOut();
|
||||
}
|
||||
if (ep_ == 1) {
|
||||
CopyOutExpertTotalCount();
|
||||
}
|
||||
}
|
||||
if (ep_ == 1 || expertTokensNumFlag_ || dropPadMode_ == 1) {
|
||||
SyncAll();
|
||||
}
|
||||
/* copy expert tokens count result from worksapce to output GM. */
|
||||
if (blockIdx_ == 0 && expertTokensNumFlag_) {
|
||||
expertCountCopyIn();
|
||||
expertCountCompute();
|
||||
expertCountCopyOut();
|
||||
}
|
||||
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::CopyIn(int64_t loop, int64_t curLoopElements)
|
||||
{
|
||||
LocalTensor<int32_t> sortedExpertIdxInLocal = sortedExpertIdxInQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopElements * sizeof(int32_t)),
|
||||
0, 0, 0};
|
||||
DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
|
||||
int64_t sortedexpertIdxOffset = loop * perCorePerLoopElements_;
|
||||
DataCopyPad(sortedExpertIdxInLocal, sortedexpertIdxGm_[sortedexpertIdxOffset], dataCopyParams, dataCopyPadParams);
|
||||
sortedExpertIdxInQueue_.EnQue(sortedExpertIdxInLocal);
|
||||
}
|
||||
|
||||
// Run-length scan over one sorted tile: for each expert id inside
// [expertStart_, expertEnd_) record its token count (COUNT/KEY_VALUE modes) or
// running total (CUMSUM mode) into expertCountOutLocal, indexed relative to
// expertStart_. Also tracks the last in-range expert and its trailing count
// (finalExpertId / expertTokenValue) for drop/pad handling in CopyOut().
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::Compute(int64_t curLoopElements)
{
    LocalTensor<int32_t> sortedExpertIdxInLocal = sortedExpertIdxInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expertCountOutLocal = expertCountOutToTempQueue_.AllocTensor<int32_t>();
    // Zero the per-loop histogram, then fence vector writes before scalar reads.
    Duplicate(expertCountOutLocal.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
              static_cast<int32_t>(actualExpertNum_));
    SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
    int64_t i = 0;
    int32_t lastExpertId = sortedExpertIdxInLocal.GetValue(0);  // current run's expert id
    int32_t lastIndex = 0;                                      // start index of the current run
    int64_t loopTokenCount = 0;                                 // in-range tokens seen so far this loop
    int32_t lastlastExpertId = lastExpertId;                    // previous run's id (for cumsum gap fill)
    for (i = 1; i < curLoopElements; i++) {
        // Ids are sorted, so once the current run is out of range, we're done.
        if ((lastExpertId >= expertEnd_) || (lastExpertId < expertStart_)) {
            break;
        }
        int32_t curExpertId = sortedExpertIdxInLocal.GetValue(i);
        if (curExpertId != lastExpertId || curExpertId >= expertEnd_) {
            // Run boundary: flush the finished run [lastIndex, i).
            if constexpr (HISTOGRAMTYPE == COUNT_MODE || HISTOGRAMTYPE == KEY_VALUE_MODE) {
                expertCountOutLocal.SetValue(lastExpertId - expertStart_, i - lastIndex);
                loopTokenCount += i - lastIndex;
            } else {
                // CUMSUM: carry the running total across experts with no tokens.
                for (int64_t j = lastlastExpertId; j < lastExpertId; j++) {
                    expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
                }
                loopTokenCount += i - lastIndex;
                expertCountOutLocal.SetValue(lastExpertId - expertStart_, loopTokenCount);
            }
            lastIndex = i;
            lastlastExpertId = lastExpertId;
            lastExpertId = curExpertId;
        }
    }
    // Flush the final run if the tile ended while still in range...
    if ((i == curLoopElements) && ((lastExpertId >= expertStart_) && (lastExpertId < expertEnd_))) {
        if constexpr (HISTOGRAMTYPE == COUNT_MODE || HISTOGRAMTYPE == KEY_VALUE_MODE) {
            expertCountOutLocal.SetValue(lastExpertId - expertStart_, i - lastIndex);
            loopTokenCount += i - lastIndex;
        } else {
            for (int64_t j = lastlastExpertId; j < lastExpertId; j++) {
                expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
            }
            loopTokenCount += i - lastIndex;
            expertCountOutLocal.SetValue(lastExpertId - expertStart_, loopTokenCount);
            // Propagate the final total to all remaining (empty) experts.
            for (int64_t j = lastExpertId; j < expertEnd_; j++) {
                expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
            }
        }
    } else {
        // ...otherwise (early break) cumsum still needs the tail filled.
        if constexpr (HISTOGRAMTYPE == EXERPT_TOKENS_CUMSUM) {
            for (int64_t j = lastlastExpertId; j < expertEnd_; j++) {
                expertCountOutLocal.SetValue(j - expertStart_, loopTokenCount);
            }
        }
    }
    actualExpertTotalNum_ += loopTokenCount;
    finalExpertId = lastExpertId;
    expertTokenValue = (i - lastIndex);

    expertCountOutToTempQueue_.EnQue<int32_t>(expertCountOutLocal);
    sortedExpertIdxInQueue_.FreeTensor(sortedExpertIdxInLocal);
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::CopyOutExpertTotalCount()
|
||||
{
|
||||
LocalTensor<int32_t> expertTotalCountLocal = expertTotalCountQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams copyTotalCountParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
expertTotalCountLocal.SetValue(0, static_cast<int32_t>(actualExpertTotalNum_));
|
||||
SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
|
||||
SetAtomicAdd<int32_t>();
|
||||
DataCopyPad(expertTotalCountGm_, expertTotalCountLocal, copyTotalCountParams);
|
||||
SetAtomicNone();
|
||||
expertTotalCountQueue_.FreeTensor(expertTotalCountLocal);
|
||||
}
|
||||
|
||||
// Accumulates this core's per-expert counts into the temporary GM workspace via
// atomic add. In DROP_PAD mode it additionally records this block's last
// (expertId, tokenCount) pair at expertIdxValueGm_[blockIdx_ * EXPERT_ID_VALUE_NUM].
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::CopyOut()
{
    LocalTensor<int32_t> expertCountOutLocal = expertCountOutToTempQueue_.DeQue<int32_t>();
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>((actualExpertNum_) * sizeof(int32_t)),
                                 0, 0, 0};
    // Scalar writes must complete before MTE3 reads the tensor.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    // Counts from all cores accumulate into the shared workspace.
    SetAtomicAdd<int32_t>();
    DataCopyPad(expertCountTempGm_, expertCountOutLocal, copyParams);
    SetAtomicNone();

    if (dropPadMode_ == DROP_PAD) {
        // Reuse the first two slots of the local tensor for the (expertId, count) pair.
        expertCountOutLocal.SetValue(0, finalExpertId);
        expertCountOutLocal.SetValue(1, expertTokenValue);
        DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(EXPERT_ID_VALUE_NUM * sizeof(int32_t)), 0, 0, 0};
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        DataCopyPad(expertIdxValueGm_[blockIdx_ * EXPERT_ID_VALUE_NUM], expertCountOutLocal, copyParams);
    }
    expertCountOutToTempQueue_.FreeTensor(expertCountOutLocal);
}
|
||||
|
||||
// Loads the int32 per-expert counts previously accumulated in the GM workspace
// (expertCountTempGm_) back into UB for final-layout conversion.
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::expertCountCopyIn()
{
    LocalTensor<int32_t> expertCountTempInLocal = expertCountTempInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>((actualExpertNum_) * sizeof(int32_t)), 0, 0, 0};
    // No padding is inserted; the copy length is exact.
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(expertCountTempInLocal, expertCountTempGm_, dataCopyParams, dataCopyPadParams);
    expertCountTempInQueue_.EnQue(expertCountTempInLocal);
}
|
||||
|
||||
// Converts the accumulated int32 per-expert counts into the final int64 output
// layout: KEY_VALUE_MODE packs dense (expertId, count) pairs for non-empty
// experts only; other modes simply widen int32 -> int64 elementwise.
template <const int HISTOGRAMTYPE>
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::expertCountCompute()
{
    LocalTensor<int32_t> expertCountTempInLocal = expertCountTempInQueue_.DeQue<int32_t>();
    LocalTensor<int64_t> expertCountOutLocal = expertIdxCountOutQueue_.AllocTensor<int64_t>();
    if constexpr (HISTOGRAMTYPE == KEY_VALUE_MODE) {
        int64_t expertOffset = 0;
        // Zero the whole (key, value) output region before scattering pairs into it.
        Duplicate(expertCountOutLocal.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
                  static_cast<int32_t>(expertCountElements_ * KEY_VALUE_MODE));
        // Vector duplicate must finish before the scalar SetValue writes below.
        SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
        for (int64_t i = 0; i < actualExpertNum_; i++) {
            int64_t expertCount = static_cast<int64_t>(expertCountTempInLocal.GetValue(i));
            if (expertCount != 0) {
                // Key = global expert id, value = token count.
                expertCountOutLocal.SetValue(expertOffset * KEY_VALUE_MODE_DIM_NUM, i + expertStart_);
                expertCountOutLocal.SetValue(expertOffset * KEY_VALUE_MODE_DIM_NUM + 1, expertCount);
                expertOffset++;
            }
        }
    } else {
        // Count/cumsum layout: widen each int32 count to int64.
        Cast(expertCountOutLocal, expertCountTempInLocal, RoundMode::CAST_NONE, actualExpertNum_);
    }

    expertIdxCountOutQueue_.EnQue<int64_t>(expertCountOutLocal);
    expertCountTempInQueue_.FreeTensor(expertCountTempInLocal);
}
|
||||
|
||||
template <const int HISTOGRAMTYPE>
|
||||
__aicore__ inline void ExpertTokensCount<HISTOGRAMTYPE>::expertCountCopyOut()
|
||||
{
|
||||
LocalTensor<int64_t> expertCountOutLocal = expertIdxCountOutQueue_.DeQue<int64_t>();
|
||||
DataCopyExtParams copyParams{static_cast<uint16_t>(1),
|
||||
static_cast<uint32_t>(expertCountElements_ * sizeof(int64_t)), 0, 0, 0};
|
||||
DataCopyPad(expertTokensCountGm_, expertCountOutLocal, copyParams);
|
||||
copyParams.blockLen = sizeof(int32_t);
|
||||
expertIdxCountOutQueue_.FreeTensor(expertCountOutLocal);
|
||||
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_EXPERT_TOKENS_COUNT_H
|
||||
280
csrc/moe_init_routing_custom/op_kernel/moe_custom_full_load.h
Normal file
280
csrc/moe_init_routing_custom/op_kernel/moe_custom_full_load.h
Normal file
@@ -0,0 +1,280 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_full_load.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_H
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// "Full load" kernel path for MoeInitRoutingCustom: the entire expertIdx tensor
// is loaded into UB at once, sorted, run-length counted per expert, and one
// token row is dynamically quantized (bf16 -> int8 with a per-row fp32 scale).
class MoeCustomFullLoad {
public:
    __aicore__ inline MoeCustomFullLoad(){};
    // Binds global-memory buffers and tiling fields, then carves UB into the
    // queues/buffers used by the pipeline.
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedX,
                                GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    // Runs CopyIn -> SortCompute -> ExpertCountCompute -> CopyOutDynamicQuant
    // on cores with blockIdx < GetBlockNum().
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn();               // load expertIdx, build identity row indices
    __aicore__ inline void SortCompute();          // sort expert ids (negated-fp32 trick)
    __aicore__ inline void ExpertCountCompute();   // run-length (expertId, count) pairs
    __aicore__ inline void CopyOutDynamicQuant();  // quantize one row, copy results out

private:
    int64_t sortNum;  // tileLength rounded up to a multiple of ONE_REPEAT_SORT_NUM

    TPipe *pipe;
    TQue<QuePosition::VECIN, 1> sortDataCopyInQueue;    // expertIdx + generated row indices
    TQue<QuePosition::VECOUT, 1> sortDataCopyOutQueue;  // sorted ids + dst->src row payload
    TQue<QuePosition::VECOUT, 1> expertTokensCountOrCumsumOutQueue;  // (id, count) pairs
    TQue<QuePosition::VECIN, 1> smoothInQueue;   // per-expert smooth-scale row
    TQue<QuePosition::VECIN, 1> inputXInQueue;   // bf16 input row
    TQue<QuePosition::VECOUT, 1> inputXOutQueue; // int8 quantized row
    TQue<QuePosition::VECOUT, 1> scaleOutQueue;  // per-row dynamic-quant scale
    TQue<QuePosition::VECOUT, 1> rowIdxOutQueue; // scattered expanded row indices

    TBuf<TPosition::VECCALC> tempBuffer;       // scratch for Concat/Sort
    TBuf<TPosition::VECCALC> sortedBuffer;     // sorted (key, payload) output
    TBuf<TPosition::VECCALC> quantTempBuffer;  // fp32 scratch for quantization

    GlobalTensor<bfloat16_t> inputXGm;
    GlobalTensor<float> smoothGm;
    GlobalTensor<int8_t> expandedXGm;
    GlobalTensor<float> expandedScaleGm;
    GlobalTensor<int32_t> expertIdxGm;
    GlobalTensor<int32_t> expendedRowIdxGm;
    GlobalTensor<int32_t> sortedExpertForSourceRowGm;
    GlobalTensor<int32_t> expandDstToSrcRowGm;
    GlobalTensor<int32_t> sortedexpertIdxGm;
    GlobalTensor<int32_t> expertCountTempGm;
    GlobalTensor<int32_t> expandedRowIdxGm;
    GlobalTensor<int64_t> expertTokensCountOrCumsumGm;

    int64_t blockIdx = 0;
    int64_t tileLength;
    int64_t bufferNum = 1;
    int64_t totalLength;  // n * k routing entries
    int64_t n;
    int64_t k;
    int64_t cols_;
    // NOTE(review): hard-coded expert table size; confirm it matches tiling data.
    int64_t expertNum_ = 256;
    int64_t rowIdxType_;
    int64_t kvFactor = 2;  // width of a (key, value) pair
    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
};
|
||||
|
||||
// Loads the whole expertIdx tensor into the first half of the sort input buffer
// and generates identity row indices 0..sortNum-1 in the second half; row index
// i is the sort payload paired with expertIdx[i].
__aicore__ inline void MoeCustomFullLoad::CopyIn()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(this->totalLength * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm, dataCopyParams, dataCopyPadParams);
    LocalTensor<int32_t> rowIdxLocal = inLocal[this->sortNum];
    // Arithmetic progression 0, 1, 2, ... over the padded sort length.
    ArithProgression<int32_t>(rowIdxLocal, 0, 1, this->sortNum);
    sortDataCopyInQueue.EnQue(inLocal);
}
|
||||
|
||||
// Sorts routing entries by expert id. Ids are cast to fp32 and negated so the
// hardware's descending Sort yields ascending expert order; tail slots beyond
// totalLength are filled with MIN_FP32 sentinels so they sort to the end.
// Output: sorted expert ids (int32) in outLocal[0..], and the dst->src row
// permutation payload in outLocal[sortNum..].
__aicore__ inline void MoeCustomFullLoad::SortCompute()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdx = inLocal[0];
    // In-place reinterpret: the int32 ids are overwritten by their fp32 images.
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, this->tileLength);
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, this->tileLength);
    int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        // Mask-select the partial final sort repeat and fill it with MIN_FP32.
        int duplicateIndex = this->totalLength - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    LocalTensor<float> concatLocal;
    LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    Concat(concatLocal, expertIdxFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    LocalTensor<uint32_t> sourceRowLocal;
    sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
    // Full sort carrying the source row index as the payload.
    Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
    LocalTensor<uint32_t> expandDstToSrcRowLocal;
    expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
    // Split the sorted (key, payload) stream back into two contiguous tensors.
    Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
    // Undo the negation to recover the original expert ids.
    Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);

    LocalTensor<int32_t> expertForSourceRowLocalInt32;
    expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
    Cast(expertForSourceRowLocalInt32, sortedExpertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
    sortDataCopyOutQueue.EnQue<float>(outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
}
|
||||
|
||||
// Run-length encodes the sorted expert ids into (expertId, tokenCount) pairs,
// then appends a terminating (0, 0) pair. The sorted tensor is re-enqueued so
// the quantization stage can consume it.
__aicore__ inline void MoeCustomFullLoad::ExpertCountCompute()
{
    LocalTensor<int32_t> outLocal = sortDataCopyOutQueue.DeQue<int32_t>();
    LocalTensor<int32_t> sortedExpertId = outLocal;
    LocalTensor<int64_t> expertTokensLocalTensor = expertTokensCountOrCumsumOutQueue.AllocTensor<int64_t>();

    int64_t i = 0;
    int32_t lastExpertId = sortedExpertId.GetValue(0);
    int32_t lastIndex = 0;
    int64_t index = 0;
    // A change of id closes the previous run and records its length.
    for (i = 1; i < this->totalLength; i++) {
        int32_t curExpertId = sortedExpertId.GetValue(i);
        if (curExpertId != lastExpertId) {
            expertTokensLocalTensor.SetValue(index * kvFactor, lastExpertId);
            expertTokensLocalTensor.SetValue(index * kvFactor + 1, i - lastIndex);
            index++;
            lastIndex = i;
            lastExpertId = curExpertId;
        }
    }
    // Close the final run. NOTE(review): the loop always exits with
    // i == totalLength, so this condition is effectively always true.
    if (i == this->totalLength) {
        expertTokensLocalTensor.SetValue(index * kvFactor, lastExpertId);
        expertTokensLocalTensor.SetValue(index * kvFactor + 1, i - lastIndex);
        index++;
    }
    // totalLength < 256
    // Terminating (0, 0) pair marks the end of the valid entries.
    expertTokensLocalTensor.SetValue(index * kvFactor, 0);
    expertTokensLocalTensor.SetValue(index * kvFactor + 1, 0);
    // Scalar writes must complete before any MTE3 copy of this tensor.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);

    expertTokensCountOrCumsumOutQueue.EnQue<int64_t>(expertTokensLocalTensor);
    // Hand the sorted data on to CopyOutDynamicQuant.
    sortDataCopyOutQueue.EnQue<int32_t>(outLocal);
}
|
||||
|
||||
// Copies the expert-token table to GM, then dynamically quantizes one token row:
//   y = x * smooth[expert]; scale = max(|y|) / 127; out = round(y / scale) as int8
// (via an fp32 -> fp16 -> int8 cast chain). Core 0 additionally writes the
// expanded row-index mapping for all tokens.
__aicore__ inline void MoeCustomFullLoad::CopyOutDynamicQuant()
{
    LocalTensor<int64_t> expertTokensLocalTensor = expertTokensCountOrCumsumOutQueue.DeQue<int64_t>();
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = expertNum_ * sizeof(int64_t);
    DataCopyPad(expertTokensCountOrCumsumGm, expertTokensLocalTensor, intriParams);
    expertTokensCountOrCumsumOutQueue.FreeTensor(expertTokensLocalTensor);
    LocalTensor<int32_t> outLocal = sortDataCopyOutQueue.DeQue<int32_t>();

    // Each core quantizes the token at sorted position == its block index.
    int64_t expertIdx = outLocal.GetValue(blockIdx);
    LocalTensor<bfloat16_t> xInLocal = inputXInQueue.AllocTensor<bfloat16_t>();
    LocalTensor<int8_t> xOutLocal = inputXOutQueue.AllocTensor<int8_t>();
    LocalTensor<float> smoothLocal = smoothInQueue.AllocTensor<float>();
    LocalTensor<float> scaleLocal = scaleOutQueue.AllocTensor<float>();
    LocalTensor<float> tempLocal = quantTempBuffer.Get<float>();
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(cols_ * sizeof(bfloat16_t)), 0, 0, 0};
    DataCopyExtParams smoothParams{1, static_cast<uint32_t>(cols_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(cols_ * sizeof(int8_t)), 0, 0, 0};
    // NOTE(review): x is read from offset 0 of inputXGm regardless of blockIdx —
    // confirm the intended source row.
    DataCopyPad(xInLocal, inputXGm, copyInParams, {false, 0, 0, 0});
    // Smooth-scale row is selected by the token's expert id.
    DataCopyPad(smoothLocal, smoothGm[expertIdx * cols_], smoothParams, {false, 0, 0, 0});
    smoothInQueue.EnQue<float>(smoothLocal);
    smoothLocal = smoothInQueue.DeQue<float>();
    Cast(tempLocal, xInLocal, RoundMode::CAST_NONE, cols_);
    // smoothLocal now holds y = x * smooth (overwritten in place).
    Mul(smoothLocal, tempLocal, smoothLocal, cols_);
    // compute scale
    Abs(tempLocal, smoothLocal, cols_);
    ReduceMax(scaleLocal, tempLocal, tempLocal, cols_);
    float scaleValue = scaleLocal.GetValue(0) / 127.0f;
    Duplicate<float>(scaleLocal, scaleValue, DST_REP_STRIDE);
    Duplicate<float>(tempLocal, scaleValue, cols_);
    // compute quant
    Div(tempLocal, smoothLocal, tempLocal, cols_);
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_ODD, cols_); // fp32->fp16
    Cast(xOutLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_RINT, cols_); // fp16->int8
    inputXOutQueue.EnQue<int8_t>(xOutLocal);
    xOutLocal = inputXOutQueue.DeQue<int8_t>();
    scaleOutQueue.EnQue<float>(scaleLocal);
    scaleLocal = scaleOutQueue.DeQue<float>();
    DataCopyPad(expandedXGm[blockIdx * cols_], xOutLocal, copyOutParams);
    DataCopyPad(expandedScaleGm[blockIdx], scaleLocal, {1, 4, 0, 0, 0});
    smoothInQueue.FreeTensor(smoothLocal);
    inputXInQueue.FreeTensor(xInLocal);
    inputXOutQueue.FreeTensor(xOutLocal);
    scaleOutQueue.FreeTensor(scaleLocal);

    // Only core 0 emits the full expanded row-index tensor.
    if (blockIdx == 0) {
        intriParams.blockLen = this->totalLength * sizeof(int32_t);
        if (rowIdxType_ == 1) {
            // The sort payload is already the desired index layout; copy directly.
            DataCopyPad(expandedRowIdxGm, outLocal[this->sortNum], intriParams);
        } else if (rowIdxType_ == 0) {
            // Invert the permutation: payload maps dst -> src, output needs src -> dst.
            LocalTensor rowIdxLocalTensor = rowIdxOutQueue.AllocTensor<int32_t>();
            for (int i = 0; i < this->totalLength; i++) {
                int32_t dstIdx = outLocal[this->sortNum].GetValue(i);
                rowIdxLocalTensor.SetValue(dstIdx, i);
            }
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm, rowIdxLocalTensor, intriParams);
            rowIdxOutQueue.FreeTensor(rowIdxLocalTensor);
        }
    }
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// Wires up global-memory tensors from kernel arguments/tiling data and
// partitions UB into the queues and scratch buffers the pipeline uses.
__aicore__ inline void MoeCustomFullLoad::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset,
                                               GM_ADDR expandedX, GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                               GM_ADDR expandedScale, const MoeInitRoutingCustomTilingData *tilingData,
                                               TPipe *tPipe)
{
    this->pipe = tPipe;
    this->blockIdx = GetBlockIdx();
    this->n = tilingData->n;
    this->k = tilingData->k;
    this->tileLength = Align(tilingData->vbsComputeParamsOp.lastCorePerLoopElements, sizeof(int32_t));
    // Round up to whole 32-element sort repeats.
    this->sortNum = Ceil(this->tileLength, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    this->totalLength = tilingData->n * tilingData->k;
    cols_ = tilingData->cols;
    rowIdxType_ = tilingData->rowIdxType;

    expertIdxGm.SetGlobalBuffer((__gm__ int32_t *)expertIdx, this->tileLength);

    expandedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, this->tileLength);
    expertTokensCountOrCumsumGm.SetGlobalBuffer((__gm__ int64_t *)expertTokensCountOrCumsum, this->tileLength);

    inputXGm.SetGlobalBuffer((__gm__ bfloat16_t *)x, this->n * cols_);
    smoothGm.SetGlobalBuffer((__gm__ float *)scale, expertNum_ * cols_);
    expandedXGm.SetGlobalBuffer((__gm__ int8_t *)expandedX, this->n * cols_ * this->k);
    expandedScaleGm.SetGlobalBuffer((__gm__ float *)expandedScale, this->n * this->k);

    // key and value
    int64_t buffSize = this->sortNum * sizeof(int32_t) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, buffSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, buffSize);
    pipe->InitBuffer(tempBuffer, buffSize);
    pipe->InitBuffer(sortedBuffer, buffSize);
    // NOTE(review): sibling buffers below use AlignBytes(...); Align(...) here may
    // produce an element count rather than a byte size — confirm intended.
    pipe->InitBuffer(expertTokensCountOrCumsumOutQueue, bufferNum, Align(expertNum_ * kvFactor, sizeof(int32_t)));

    pipe->InitBuffer(smoothInQueue, bufferNum, AlignBytes(cols_, sizeof(float)));
    pipe->InitBuffer(inputXInQueue, bufferNum, AlignBytes(cols_, sizeof(bfloat16_t)));
    pipe->InitBuffer(inputXOutQueue, bufferNum, AlignBytes(cols_, sizeof(int8_t)));
    pipe->InitBuffer(quantTempBuffer, AlignBytes(cols_, sizeof(float)));
    pipe->InitBuffer(scaleOutQueue, bufferNum, AlignBytes(1, sizeof(float)));
    pipe->InitBuffer(rowIdxOutQueue, bufferNum, AlignBytes(this->totalLength, sizeof(int32_t)));
}
|
||||
|
||||
// Drives the full-load pipeline: copy-in, sort, per-expert counting, and
// dynamic-quant copy-out, in that order.
__aicore__ inline void MoeCustomFullLoad::Process()
{
    // Cores beyond the scheduled block count have no work assigned.
    if (this->blockIdx >= GetBlockNum()) {
        return;
    }
    CopyIn();
    SortCompute();
    ExpertCountCompute();
    CopyOutDynamicQuant();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_H
|
||||
@@ -0,0 +1,512 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_base_full_load.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_BASE_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_BASE_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Shared base for the full-load variants of MoeInitRoutingCustom: holds tiling
// state, UB queues/buffers, and the sort/count primitives. Derived classes (not
// shown here) drive the pipeline.
template <typename T>
class MoeCustomFullLoadBase {
public:
    __aicore__ inline MoeCustomFullLoadBase(){};
    // Binds GM tensors, caches tiling fields, and initializes UB queues/buffers.
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);

protected:
    __aicore__ inline void CopyIn();
    __aicore__ inline void Compute();            // dispatches to range or plain sort
    __aicore__ inline void TilingInKernel();
    __aicore__ inline void SortComputeWithRange();  // sorts only ids in [expertStart_, expertEnd_)
    __aicore__ inline void SortCompute();
    __aicore__ inline void CopyOutIdx();
    __aicore__ inline void CopyOutDefaultGatherIdx();
    __aicore__ inline void CopyOutDefaultTokenCountOrCumsum();
    __aicore__ inline void ComputeExpertTokenCountOrCumsum();

protected:
    int64_t sortNum_;  // valid element count rounded up to whole sort repeats
    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;
    int64_t blockIdx_;
    int64_t needCoreNum_;
    int64_t coreIndicesElements_;     // index count handled by this core
    int64_t perCoreIndicesElements_;  // index count per non-last core
    int64_t k_;
    int64_t n_;
    int64_t cols_;
    int64_t dropPadMode_;
    int64_t activeNum_;
    int64_t expertNum_;
    int64_t expertStart_ = 0;  // inclusive start of this rank's expert range
    int64_t expertEnd_ = 0;    // exclusive end of this rank's expert range
    int64_t bufferNum_ = 1;
    int64_t kvFactor_ = 2;  // width of a (key, value) pair
    int64_t totalLength_;   // n * k routing entries
    int64_t tileLength_;
    int64_t expertTokensNumType_ = 0;
    int64_t expertTokensNumFlag_ = 0;
    uint64_t actual_idx_num_ = 0;  // entries surviving the expert-range filter
    int64_t ep_ = 0;               // expert-parallel mode flag
    int64_t gatherFirstFullload_ = 0;
    int64_t isInputScale_ = 0;
    int64_t rowIdxType_ = 0;
    int64_t actualExpertNum_ = 0;
    int64_t expertCountElements_ = 0;
    int64_t curIndexStart_;
    int64_t startXRow_;
    int64_t endXRow_;
    int64_t quantMode_ = -1;

    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
    static constexpr int64_t MASK_STRIDE = 64;

    TQue<QuePosition::VECOUT, 1> expandedRowIdxCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> expandedExpertIdxCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> expandDstToSrcRowQueue_;
    TQue<QuePosition::VECOUT, 1> expertTokensCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> sortDataCopyInQueue_;

    TBuf<TPosition::VECCALC> tempBuffer_;    // Concat/Sort scratch and compare masks
    TBuf<TPosition::VECCALC> sortedBuffer_;  // sorted output / MIN_FP32 fill

    GlobalTensor<int32_t> expertIdxGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<int64_t> expertTokensCountOrCumsumGm_;

    TPipe *pipe_;
};
|
||||
|
||||
// Caches tiling fields, computes this core's index-range split, binds the GM
// tensors, and partitions UB into the queues/buffers used by the sort pipeline.
template <typename T>
__aicore__ inline void MoeCustomFullLoadBase<T>::Init(GM_ADDR expertIdx, GM_ADDR expandedRowIdx,
                                                      GM_ADDR expertTokensCountOrCumsum, GM_ADDR workspace,
                                                      const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);
    this->blockIdx_ = GetBlockIdx();
    this->n_ = tilingData->n;
    this->k_ = tilingData->k;
    this->cols_ = tilingData->cols;
    this->expertStart_ = tilingData->expertStart;
    this->expertEnd_ = tilingData->expertEnd;
    this->needCoreNum_ = this->gatherOutTilingData_->needCoreNum;

    this->perCoreIndicesElements_ = this->gatherOutTilingData_->perCoreIndicesElements;
    this->dropPadMode_ = tilingData->dropPadMode;
    this->activeNum_ = tilingData->activeNum;
    this->quantMode_ = tilingData->quantMode;
    // The last active core takes the remainder of the index range.
    if (this->blockIdx_ == this->gatherOutTilingData_->needCoreNum - 1) {
        this->coreIndicesElements_ = this->gatherOutTilingData_->lastCoreIndicesElements;
    } else {
        this->coreIndicesElements_ = this->gatherOutTilingData_->perCoreIndicesElements;
    }
    this->expertTokensNumType_ = tilingData->expertTokensNumType;
    this->expertTokensNumFlag_ = tilingData->expertTokensNumFlag;
    this->expertNum_ = tilingData->expertNum;
    this->totalLength_ = tilingData->n * tilingData->k;
    this->ep_ = tilingData->ep;
    this->gatherFirstFullload_ = tilingData->gatherFirstFullload;
    this->isInputScale_ = tilingData->isInputScale;
    this->tileLength_ = Align(tilingData->vbsComputeParamsOp.lastCorePerLoopElements, sizeof(int32_t));
    // Round up to whole 32-element sort repeats.
    this->sortNum_ = Ceil(this->tileLength_, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    this->actual_idx_num_ = this->totalLength_;
    this->rowIdxType_ = tilingData->rowIdxType;
    this->actualExpertNum_ = tilingData->actualExpertNum;
    this->pipe_ = tPipe;

    expertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expertIdx, this->tileLength_);
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, this->tileLength_);
    if (this->expertTokensNumFlag_ > 0) {
        expertTokensCountOrCumsumGm_.SetGlobalBuffer((__gm__ int64_t *)expertTokensCountOrCumsum);
    }

    // KEY_VALUE output stores (id, count) pairs; other modes store one slot per expert.
    if (expertTokensNumType_ == EXERPT_TOKENS_KEY_VALUE) {
        expertCountElements_ = expertNum_ * EXERPT_TOKENS_KEY_VALUE;
    } else {
        expertCountElements_ = actualExpertNum_;
    }
    int64_t buffSize = this->sortNum_ * sizeof(int32_t);

    // This core's slice of the flattened (n * k) index space, and the x rows it spans.
    curIndexStart_ = this->blockIdx_ * this->perCoreIndicesElements_;
    startXRow_ = curIndexStart_ / this->k_;
    endXRow_ = (curIndexStart_ + this->coreIndicesElements_ - 1) / this->k_;

    pipe_->InitBuffer(expandedExpertIdxCopyOutQueue_, bufferNum_, buffSize);
    pipe_->InitBuffer(expertTokensCopyOutQueue_, bufferNum_, AlignBytes(expertCountElements_, sizeof(int64_t)));
    pipe_->InitBuffer(expandDstToSrcRowQueue_, bufferNum_, buffSize);
    pipe_->InitBuffer(expandedRowIdxCopyOutQueue_, bufferNum_, buffSize)
;
    pipe_->InitBuffer(sortDataCopyInQueue_, bufferNum_, buffSize * kvFactor_);
    pipe_->InitBuffer(tempBuffer_, buffSize * kvFactor_);
    pipe_->InitBuffer(sortedBuffer_, buffSize * kvFactor_);
}
|
||||
|
||||
// Loads the full expertIdx tensor into the first half of the sort input buffer
// and generates identity row indices 0..totalLength-1 in the second half.
template <typename T>
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyIn()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0,
                                     0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm_, dataCopyParams, dataCopyPadParams);
    // Row index i is the sort payload paired with expertIdx[i].
    ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, totalLength_);
    sortDataCopyInQueue_.EnQue(inLocal);
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeCustomFullLoadBase<T>::Compute()
|
||||
{
|
||||
if (ep_) {
|
||||
SortComputeWithRange();
|
||||
} else {
|
||||
SortCompute();
|
||||
}
|
||||
}
|
||||
|
||||
// Sorts routing entries restricted to this rank's expert range. Ids are negated
// in fp32 so the descending hardware Sort yields ascending expert order.
// gatherFirstFullload_ path: compact in-range entries first with compare masks +
// GatherMask, then sort only the survivors. Otherwise: out-of-range ids are
// replaced by MIN_FP32 sentinels so they sort to the end.
template <typename T>
__aicore__ inline void MoeCustomFullLoadBase<T>::SortComputeWithRange()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdxLocal = inLocal[0];
    // In-place reinterpret: the int32 ids are overwritten by their fp32 images.
    LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
    LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
    Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, totalLength_);
    PipeBarrier<PIPE_V>();
    Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    if (gatherFirstFullload_) {
        int64_t maskOffset = AlignBytes(Ceil(totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE, sizeof(int8_t));
        LocalTensor<uint8_t> compareScalarMaskLocalTensor0 = tempBuffer_.Get<uint8_t>()[maskOffset];
        LocalTensor<uint8_t> compareScalarMaskLocalTensor1 = tempBuffer_.Get<uint8_t>()[maskOffset * kvFactor_];
        LocalTensor<uint8_t> gatherMaskLocalTensor = tempBuffer_.Get<uint8_t>();

        // Find elements >= expertStart_, which means -elements <= -expertStart_
        AscendC::CompareScalar(
            compareScalarMaskLocalTensor0, expertIdxLocalFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::LE,
            (totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        PipeBarrier<PIPE_V>();

        // Find elements < expertEnd_, which means -elements > -expertEnd_
        AscendC::CompareScalar(
            compareScalarMaskLocalTensor1, expertIdxLocalFp32, static_cast<float>(-expertEnd_), AscendC::CMPMODE::GT,
            (totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        PipeBarrier<PIPE_V>();

        // Combine both conditions into one gather mask: expertStart_ <= id < expertEnd_.
        And(gatherMaskLocalTensor.ReinterpretCast<uint16_t>(),
            compareScalarMaskLocalTensor0.ReinterpretCast<uint16_t>(),
            compareScalarMaskLocalTensor1.ReinterpretCast<uint16_t>(),
            Ceil(totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE / kvFactor_);
        PipeBarrier<PIPE_V>();

        uint64_t rsvdCnt = 0;
        GatherMaskParams gatherMaskParams;
        gatherMaskParams.repeatTimes = 1;
        gatherMaskParams.src0BlockStride = 1;
        gatherMaskParams.src0RepeatStride = DST_REP_STRIDE;
        gatherMaskParams.src1RepeatStride = DST_REP_STRIDE;
        // Compact the in-range ids to the front; rsvdCnt returns the survivor count.
        GatherMask(expertIdxLocalFp32, expertIdxLocalFp32, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
                   static_cast<uint32_t>(totalLength_), gatherMaskParams, rsvdCnt);
        PipeBarrier<PIPE_V>();
        actual_idx_num_ = rsvdCnt;
        sortNum_ = Ceil(actual_idx_num_, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;

        // Compact the matching row indices with the same mask.
        GatherMask(rowIdxLocal, rowIdxLocal, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
                   static_cast<uint32_t>(totalLength_), gatherMaskParams, actual_idx_num_);
        PipeBarrier<PIPE_V>();
        TilingInKernel();
    } else {
        LocalTensor<uint8_t> maskLocalTensor = tempBuffer_.Get<uint8_t>();
        AscendC::CompareScalar(
            maskLocalTensor, expertIdxLocalFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::GT,
            (totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        LocalTensor<float> floatMinLocalTensor = sortedBuffer_.Get<float>();
        Duplicate(floatMinLocalTensor, MIN_FP32, totalLength_);
        PipeBarrier<PIPE_V>();
        // Out-of-range entries are overwritten with MIN_FP32 so they sort last.
        Select(expertIdxLocalFp32, maskLocalTensor, floatMinLocalTensor, expertIdxLocalFp32,
               SELMODE::VSEL_TENSOR_TENSOR_MODE, totalLength_);
        PipeBarrier<PIPE_V>();
    }
    // handle actual_idx_num_ == 0
    if (actual_idx_num_ < 1) {
        sortDataCopyInQueue_.FreeTensor(inLocal);
        return;
    }
    int64_t duplicateNum = actual_idx_num_ % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        // Fill the partial final sort repeat with MIN_FP32 sentinels.
        int duplicateIndex = actual_idx_num_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> (FP32_ONE_REPEAT_NUM - ONE_REPEAT_SORT_NUM));
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
        PipeBarrier<PIPE_V>();
    }

    LocalTensor<float> concatLocal = expertIdxLocalFp32;
    LocalTensor<float> tempTensor = tempBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    LocalTensor<float> sortedLocal = sortedBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    // Full sort carrying the source row index as the payload.
    Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
    LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
    Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    // Undo the negation to recover the original expert ids.
    Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, actual_idx_num_);
    PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> expandedExpertIdxLocalInt32;
    expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
    Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, actual_idx_num_);
    PipeBarrier<PIPE_V>();
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
    expandDstToSrcRowQueue_.EnQue<uint32_t>(expandDstToSrcRowLocal);
    sortDataCopyInQueue_.FreeTensor(inLocal);
}
|
||||
|
||||
template <typename T>
// Sorts all token->expert assignments by expert id and derives both index mappings:
//   - expandDstToSrcRowLocal : sorted position -> original (expanded) row index
//   - expandedRowIdx         : original row index -> sorted position
// The hardware Sort32 sorts descending, so keys are negated (Muls by -1) before the
// sort to obtain ascending order, and negated back afterwards.
__aicore__ inline void MoeCustomFullLoadBase<T>::SortCompute()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdxLocal = inLocal[0];
    // In-place reinterpret: the int32 expert ids are overwritten by their fp32 form.
    LocalTensor<float> expertIdxLocalFp32 = expertIdxLocal.ReinterpretCast<float>();
    Cast(expertIdxLocalFp32, expertIdxLocal, RoundMode::CAST_ROUND, totalLength_);
    PipeBarrier<PIPE_V>();
    // Negate so the descending hardware sort yields ascending expert ids.
    Muls(expertIdxLocalFp32, expertIdxLocalFp32, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    // Pad the tail of the last partial 32-element sort group with MIN_FP32 so the
    // padding entries sink to the end of the (descending) sort result.
    int64_t duplicateNum = totalLength_ % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = totalLength_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> (FP32_ONE_REPEAT_NUM - ONE_REPEAT_SORT_NUM));
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
        PipeBarrier<PIPE_V>();
    }
    // First sort pass: key = negated expert id, payload = original row index.
    LocalTensor<float> concatLocal = expertIdxLocalFp32;
    LocalTensor<float> tempTensor = tempBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    Concat(concatLocal, expertIdxLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<uint32_t> rowIdxLocal = inLocal[this->sortNum_].template ReinterpretCast<uint32_t>();
    LocalTensor<float> sortedLocal = sortedBuffer_.Get<float>(GetSortLen<float>(this->sortNum_));
    Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<float> expandedExpertIdxLocal = expandedExpertIdxCopyOutQueue_.AllocTensor<float>();
    LocalTensor<uint32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.AllocTensor<uint32_t>();
    LocalTensor<float> expandDstToSrcRowLocalFp32 = expandDstToSrcRowLocal.ReinterpretCast<float>();
    // Split the sorted (key, payload) pairs: keys -> sorted expert ids (still negated),
    // payloads -> source row per sorted position.
    Extract(expandedExpertIdxLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    LocalTensor<uint32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<uint32_t>();
    // Undo the negation and convert the sorted expert ids back to int32.
    Muls(expandedExpertIdxLocal, expandedExpertIdxLocal, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> expandedExpertIdxLocalInt32;
    expandedExpertIdxLocalInt32 = expandedExpertIdxLocal.ReinterpretCast<int32_t>();
    Cast(expandedExpertIdxLocalInt32, expandedExpertIdxLocal, RoundMode::CAST_ROUND, totalLength_);
    PipeBarrier<PIPE_V>();

    // Second sort pass: invert the permutation. Key = negated dst->src row index,
    // payload = the sorted position (0..totalLength_-1 via ArithProgression), which
    // after sorting yields src-row -> sorted-position (expandedRowIdx).
    Cast(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
         totalLength_);
    PipeBarrier<PIPE_V>();
    Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, totalLength_);
    PipeBarrier<PIPE_V>();
    ArithProgression<int32_t>(inLocal[this->sortNum_], 0, 1, totalLength_);
    PipeBarrier<PIPE_V>();
    if (duplicateNum > 0) {
        // Same tail padding as above, now for the row-index keys.
        int duplicateIndex = totalLength_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> (FP32_ONE_REPEAT_NUM - ONE_REPEAT_SORT_NUM));
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expandDstToSrcRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
        PipeBarrier<PIPE_V>();
    }
    Concat(concatLocal, expandDstToSrcRowLocalFp32, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    Sort<float, true>(sortedLocal, concatLocal, rowIdxLocal, tempTensor, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    Extract(tempTensor, expandedRowIdx, sortedLocal, this->sortNum_ / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    // SCATTER output (or quantMode 1) also needs the dst->src mapping as plain int32,
    // so restore it from the negated fp32 working copy.
    if (rowIdxType_ == SCATTER or quantMode_ == 1) {
        Muls(expandDstToSrcRowLocalFp32, expandDstToSrcRowLocalFp32, (float)-1, totalLength_);
        PipeBarrier<PIPE_V>();
        Cast(expandDstToSrcRowLocal.ReinterpretCast<int32_t>(), expandDstToSrcRowLocalFp32, RoundMode::CAST_RINT,
             totalLength_);
    }
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdxLocalInt32);
    expandedRowIdxCopyOutQueue_.EnQue<uint32_t>(expandedRowIdx);
    expandDstToSrcRowQueue_.EnQue<uint32_t>(expandDstToSrcRowLocal);
    sortDataCopyInQueue_.FreeTensor(inLocal);
}
|
||||
|
||||
template <typename T>
// Fallback output path used when no expert on this rank receives any token:
// fills expandedRowIdx with -1 ("row not routed") and writes it to global memory.
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyOutDefaultGatherIdx()
{
    LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<int32_t>();
    Duplicate(expandedRowIdx, static_cast<int32_t>(-1), static_cast<int32_t>(totalLength_));
    // The vector Duplicate must complete in UB before MTE3 copies the buffer out.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0, 0,
                                 0};
    DataCopyPad(expandedRowIdxGm_, expandedRowIdx, copyParams);
    expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
}
|
||||
|
||||
template <typename T>
// Fallback output path used when no expert on this rank receives any token:
// writes an all-zero expert token count/cumsum buffer to global memory.
// The Duplicate runs over int32 views, so the element count is doubled
// (EXERPT_TOKENS_KEY_VALUE == 2) to cover the int64 output buffer.
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyOutDefaultTokenCountOrCumsum()
{
    LocalTensor<int64_t> expertTokensOut = expertTokensCopyOutQueue_.AllocTensor<int64_t>();
    Duplicate(expertTokensOut.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
              static_cast<int32_t>(expertCountElements_ * EXERPT_TOKENS_KEY_VALUE));
    // Vector zero-fill must land in UB before the MTE3 copy-out reads it.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                 static_cast<uint32_t>(expertCountElements_ * sizeof(int64_t)), 0, 0, 0};
    DataCopyPad(expertTokensCountOrCumsumGm_, expertTokensOut, copyParams);
    expertTokensCopyOutQueue_.FreeTensor(expertTokensOut);
}
|
||||
|
||||
template <typename T>
// Writes the expandedRowIdx output to GM. Three paths:
//   - SCATTER: the dst->src mapping is written out directly.
//   - GATHER + expert parallelism (ep_): build src->dst on the scalar unit,
//     initialising to -1 and filling only entries routed to local experts.
//   - GATHER, no EP: the precomputed src->dst mapping from SortCompute is written.
// Both borrowed queue tensors are re-enqueued at the end for later stages.
__aicore__ inline void MoeCustomFullLoadBase<T>::CopyOutIdx()
{
    LocalTensor<int32_t> expandedExpertIdx = expandedExpertIdxCopyOutQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> expandDstToSrcRowLocal = expandDstToSrcRowQueue_.DeQue<int32_t>();
    if (rowIdxType_ == SCATTER) {
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(actual_idx_num_ * sizeof(int32_t)),
                                     0, 0, 0};
        DataCopyPad(expandedRowIdxGm_, expandDstToSrcRowLocal, copyParams);
    } else if (ep_) {
        LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.AllocTensor<int32_t>();
        Duplicate(expandedRowIdx, static_cast<int32_t>(-1), static_cast<int32_t>(totalLength_));
        // Scalar GetValue/SetValue below read UB written by the vector unit.
        SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
        for (int64_t i = 0; i < actual_idx_num_; i++) {
            int32_t curExpertId = expandedExpertIdx.GetValue(i);
            // NOTE(review): `break` (not `continue`) assumes entries for experts
            // outside [expertStart_, expertEnd_) never precede local ones in the
            // sorted order — confirm against the sort/tiling contract.
            if (curExpertId < expertStart_ || curExpertId >= expertEnd_) {
                break;
            }
            int64_t outIndices = expandDstToSrcRowLocal.GetValue(i);
            expandedRowIdx.SetValue(outIndices, i);
        }
        // Scalar writes must complete before MTE3 copies the buffer out.
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0,
                                     0, 0};
        DataCopyPad(expandedRowIdxGm_, expandedRowIdx, copyParams);
        expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
    } else {
        LocalTensor<int32_t> expandedRowIdx = expandedRowIdxCopyOutQueue_.DeQue<int32_t>();
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(totalLength_ * sizeof(int32_t)), 0,
                                     0, 0};
        DataCopyPad(expandedRowIdxGm_, expandedRowIdx, copyParams);
        // Keep the tensor alive for the gather copy-out stage.
        expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx);
    }
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdx);
    expandDstToSrcRowQueue_.EnQue<int32_t>(expandDstToSrcRowLocal);
}
|
||||
|
||||
template <typename T>
// Run-length scan over the sorted expert-id list to produce the per-expert token
// statistics in one of three formats selected by expertTokensNumType_:
//   - EXERPT_TOKENS_KEY_VALUE: (expertId, count) pairs, densely packed;
//   - EXERPT_TOKENS_COUNT:     count per local expert slot;
//   - otherwise (cumsum):      running total per local expert slot, with gaps
//                              between observed experts filled with the prior total.
// Only experts in [expertStart_, expertEnd_) — this rank's slice — are recorded.
__aicore__ inline void MoeCustomFullLoadBase<T>::ComputeExpertTokenCountOrCumsum()
{
    // compute
    LocalTensor<int32_t> expandedExpertIdx = expandedExpertIdxCopyOutQueue_.DeQue<int32_t>();
    LocalTensor<int64_t> expertTokensOut = expertTokensCopyOutQueue_.AllocTensor<int64_t>();
    // Zero the whole int64 output via an int32 view (2x element count).
    Duplicate(expertTokensOut.ReinterpretCast<int32_t>(), static_cast<int32_t>(0),
              static_cast<int32_t>(expertCountElements_ * EXERPT_TOKENS_KEY_VALUE));
    // Scalar loop below reads UB written by the vector Duplicate.
    SetWaitFlag<HardEvent::V_S>(HardEvent::V_S);
    int64_t i = 0;
    int32_t lastExpertId = expandedExpertIdx.GetValue(0);  // id of the current run
    int32_t lastLastId = lastExpertId;                     // id of the previous run (for cumsum gap fill)
    int64_t tokenCount = 0;                                // running cumsum total
    int64_t lastIndex = 0;                                 // start index of the current run
    int64_t Offset = 0;                                    // next slot for key-value output
    for (i = 1; i < actual_idx_num_; i++) {
        // Stop once the current run has left this rank's expert range.
        if ((lastExpertId >= expertEnd_) || (lastExpertId < expertStart_)) {
            break;
        }
        int32_t curExpertId = expandedExpertIdx.GetValue(i);
        if (curExpertId != lastExpertId || curExpertId >= expertEnd_) {
            // Run boundary: flush the finished run [lastIndex, i) of lastExpertId.
            int64_t expertOffset = lastExpertId - expertStart_;
            if (expertTokensNumType_ == EXERPT_TOKENS_KEY_VALUE) {
                expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE, lastExpertId);
                expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE + 1, i - lastIndex);
                Offset += 1;
            } else if (expertTokensNumType_ == EXERPT_TOKENS_COUNT) {
                expertTokensOut.SetValue(expertOffset, i - lastIndex);
            } else {
                // Cumsum: experts between the previous run and this one saw no
                // tokens — they inherit the running total.
                for (int64_t j = lastLastId; j < lastExpertId; j++) {
                    expertTokensOut.SetValue(j - expertStart_, tokenCount);
                }
                tokenCount += i - lastIndex;
                expertTokensOut.SetValue(expertOffset, tokenCount);
            }
            lastIndex = i;
            lastLastId = lastExpertId;
            lastExpertId = curExpertId;
        }
    }
    if ((i == actual_idx_num_) && ((lastExpertId >= expertStart_) && (lastExpertId < expertEnd_))) {
        // Scanned to the end with a still-open local run: flush the final run,
        // and for cumsum also fill all trailing experts with the final total.
        int64_t expertOffset = lastExpertId - expertStart_;
        if (expertTokensNumType_ == EXERPT_TOKENS_KEY_VALUE) {
            expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE, lastExpertId);
            expertTokensOut.SetValue(Offset * EXERPT_TOKENS_KEY_VALUE + 1, i - lastIndex);
        } else if (expertTokensNumType_ == EXERPT_TOKENS_COUNT) {
            expertTokensOut.SetValue(expertOffset, i - lastIndex);
        } else {
            for (int64_t j = lastLastId; j < lastExpertId; j++) {
                expertTokensOut.SetValue(j - expertStart_, tokenCount);
            }
            tokenCount += i - lastIndex;
            expertTokensOut.SetValue(expertOffset, tokenCount);
            for (int64_t j = lastExpertId; j < expertEnd_; j++) {
                expertTokensOut.SetValue(j - expertStart_, tokenCount);
            }
        }
    } else {
        // Loop exited early (ran past the local expert range): for cumsum,
        // propagate the final total to the remaining local expert slots.
        if (expertTokensNumType_ == EXERPT_TOKENS_CUMSUM) {
            for (int64_t j = lastLastId; j < expertEnd_; j++) {
                expertTokensOut.SetValue(j - expertStart_, tokenCount);
            }
        }
    }
    expandedExpertIdxCopyOutQueue_.EnQue<int32_t>(expandedExpertIdx);
    // Scalar writes must complete before MTE3 reads the output buffer.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                 static_cast<uint32_t>(expertCountElements_ * sizeof(int64_t)), 0, 0, 0};
    DataCopyPad(expertTokensCountOrCumsumGm_, expertTokensOut, copyParams);
    // Ensure the copy-out finished before the buffer can be reused.
    SetWaitFlag<HardEvent::MTE3_V>(HardEvent::MTE3_V);
    expertTokensCopyOutQueue_.FreeTensor(expertTokensOut);
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeCustomFullLoadBase<T>::TilingInKernel()
|
||||
{
|
||||
int64_t coreNum = needCoreNum_;
|
||||
perCoreIndicesElements_ = Ceil(actual_idx_num_, coreNum);
|
||||
needCoreNum_ = Ceil(actual_idx_num_, perCoreIndicesElements_);
|
||||
int64_t lastCoreIndicesElements = actual_idx_num_ - (needCoreNum_ - 1) * perCoreIndicesElements_;
|
||||
if (blockIdx_ == needCoreNum_ - 1) {
|
||||
coreIndicesElements_ = lastCoreIndicesElements;
|
||||
} else {
|
||||
coreIndicesElements_ = perCoreIndicesElements_;
|
||||
}
|
||||
curIndexStart_ = this->blockIdx_ * this->perCoreIndicesElements_;
|
||||
startXRow_ = curIndexStart_ / this->k_;
|
||||
endXRow_ = (curIndexStart_ + this->coreIndicesElements_ - 1) / this->k_;
|
||||
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_BASE_H
|
||||
@@ -0,0 +1,300 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_full_load_dynamic_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_DYNAMIC_QUANT_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_DYNAMIC_QUANT_H
|
||||
|
||||
#include "moe_custom_full_load_base.h"
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Full-load MoE routing kernel variant with per-token dynamic int8 quantization.
// Template parameters:
//   T           - input element type of x (float / half-like; cast to fp32 internally)
//   COPYOUTTYPE - selects the copy-out strategy (0 -> gather path when possible)
//   SMOOTHTYPE  - smooth-scale layout: NO_SCALE, SCALE_1H (one row), or SCALE_EH
//                 (one row per expert; forces the scatter copy-out path)
class MoeCustomFullLoadDynamicQuant : public MoeCustomFullLoadBase<T> {
public:
    __aicore__ inline MoeCustomFullLoadDynamicQuant(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expandedRowIdx,
                                GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Copy-out driven by source rows (writes each row to all its routed slots).
    __aicore__ inline void CopyOutXDynamicQuantFromGather();
    // Copy-out driven by destination slots (re-loads the source row per slot).
    __aicore__ inline void CopyOutXDynamicQuantFromScatter();
    // Drains and releases queue tensors still held at the end of Process().
    __aicore__ inline void FreeLocalTensor();
    // Per-row dynamic quantization: scale = max(|x|)/127, out = round(x/scale).
    __aicore__ inline void ComputeQuant(LocalTensor<float> &smoothLocal);

private:
    TQue<QuePosition::VECIN, 1> xCopyInQueue_;    // staged input row(s) of x
    TQue<QuePosition::VECIN, 1> smoothInQueue_;   // smooth-scale row
    TBuf<TPosition::VECCALC> tmpBuff_;            // fp32 scratch for quantization
    TQue<QuePosition::VECOUT, 1> inputXOutQueue_; // quantized int8 output row
    TQue<QuePosition::VECOUT, 1> scaleOutQueue_;  // per-row dynamic scale

    GlobalTensor<T> xGm_;                 // input activations
    GlobalTensor<int8_t> expandedXGm_;    // routed, quantized output
    GlobalTensor<float> quantSmoothGm_;   // smooth scales (layout per SMOOTHTYPE)
    GlobalTensor<float> expandedScaleGm_; // per-output-row dynamic scales

    int64_t colsAlign_ = 0; // cols_ aligned for the raw-T staging offset
};
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Binds global buffers and sizes the unified-buffer queues for one row of work.
// For non-fp32 T, the x staging buffer is doubled: the raw T row is loaded into
// the upper half (at colsAlign_) and cast to fp32 into the lower half.
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::Init(
    GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expandedRowIdx,
    GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    MoeCustomFullLoadBase<T>::Init(expertIdx, expandedRowIdx, expertTokensCountOrCumsum, workspace, tilingData, tPipe);

    xGm_.SetGlobalBuffer((__gm__ T *)x);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);
    quantSmoothGm_.SetGlobalBuffer((__gm__ float *)scale);
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);
    this->colsAlign_ = Align(this->cols_, sizeof(T));
    if constexpr (IsSameType<T, float>::value) {
        this->pipe_->InitBuffer(xCopyInQueue_, 1, AlignBytes(this->cols_, sizeof(float)));
    } else {
        // 2x: raw T row in the upper half + fp32 conversion in the lower half.
        this->pipe_->InitBuffer(xCopyInQueue_, 1, 2 * AlignBytes(this->cols_, sizeof(T)));
    }
    this->pipe_->InitBuffer(inputXOutQueue_, 1, AlignBytes(this->cols_, sizeof(int8_t)));
    this->pipe_->InitBuffer(smoothInQueue_, 1, AlignBytes(this->cols_, sizeof(float)));
    this->pipe_->InitBuffer(tmpBuff_, AlignBytes(this->cols_, sizeof(float)));
    this->pipe_->InitBuffer(scaleOutQueue_, 1, BLOCK_BYTES + BLOCK_BYTES);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Kernel entry for the dynamic-quant variant: routing sort, index/statistics
// copy-out, then the quantized data copy-out. Note that Compute() may update
// needCoreNum_ (in-kernel retiling), so it is re-checked after the call.
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::Process()
{
    if (this->blockIdx_ < this->needCoreNum_) {
        this->CopyIn();
        this->Compute();

        // Valid-expert count is zero: core 0 writes the default outputs, all exit.
        if (this->needCoreNum_ < 1) {
            if (this->blockIdx_ == 0) {
                if (this->rowIdxType_ == GATHER) {
                    this->CopyOutDefaultGatherIdx();
                }
                if (this->expertTokensNumFlag_ == 1) {
                    this->CopyOutDefaultTokenCountOrCumsum();
                }
            }
            return;
        }

        // Core 0 writes the row-index mapping output.
        if (this->blockIdx_ == 0) {
            this->CopyOutIdx();
        }

        // The last active core writes the per-expert token statistics.
        if (this->blockIdx_ == this->needCoreNum_ - 1 && this->expertTokensNumFlag_ == 1) {
            this->ComputeExpertTokenCountOrCumsum();
        }

        // Re-check against the possibly reduced needCoreNum_ before copy-out.
        if (this->blockIdx_ < this->needCoreNum_) {
            // Per-expert smooth scales (SCALE_EH) require the scatter path so the
            // right scale row is used for each destination slot.
            if constexpr (!COPYOUTTYPE && SMOOTHTYPE != SCALE_EH) {
                CopyOutXDynamicQuantFromGather();
            } else {
                CopyOutXDynamicQuantFromScatter();
            }
        }

        FreeLocalTensor();
    }
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Dynamically quantizes one row: optionally multiplies by the smooth scale, then
// scale = max(|row|) / 127 and out = round(row / scale) cast down to int8 via the
// int32 -> half -> int8 conversion chain. Enqueues the int8 row and its scale.
__aicore__ inline void
MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::ComputeQuant(LocalTensor<float> &smoothLocal)
{
    LocalTensor<float> tempLocal = tmpBuff_.Get<float>();
    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();
    LocalTensor<float> dynamicQuantLocal = scaleOutQueue_.AllocTensor<float>();
    LocalTensor<float> inLocal = xCopyInQueue_.DeQue<float>();

    // Non-fp32 input was staged at offset colsAlign_; convert it to fp32 in place.
    if constexpr (!IsSameType<T, float>::value && !IsSameType<T, int8_t>::value) {
        Cast(inLocal, inLocal.ReinterpretCast<T>()[colsAlign_], RoundMode::CAST_NONE, this->cols_);
        PipeBarrier<PIPE_V>();
    }

    if constexpr (SMOOTHTYPE != NO_SCALE) {
        Mul(inLocal, inLocal, smoothLocal, this->cols_);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, this->cols_);
    PipeBarrier<PIPE_V>();

    ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols_);
    PipeBarrier<PIPE_V>();

    // NOTE(review): scalar GetValue reads UB written by the vector ReduceMax; a
    // V->S event (not only PipeBarrier<PIPE_V>) is usually needed here — confirm.
    float maxValue = dynamicQuantLocal.GetValue(0) / MAX_INT8;

    // Broadcast the scale: one block for the scale output, a full row as divisor.
    Duplicate<float>(dynamicQuantLocal, maxValue, INT32_ONE_BLOCK_NUM);
    PipeBarrier<PIPE_V>();
    Duplicate<float>(tempLocal, maxValue, this->cols_);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, this->cols_);
    PipeBarrier<PIPE_V>();

    // fp32 -> int32 (round-to-nearest-even) -> half -> int8 conversion chain.
    LocalTensor<int32_t> intLocal = tempLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, tempLocal, RoundMode::CAST_RINT, this->cols_);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(intLocal.ReinterpretCast<half>(), intLocal, RoundMode::CAST_ROUND, this->cols_);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, intLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, this->cols_);

    inputXOutQueue_.EnQue<int8_t>(outLocal);
    scaleOutQueue_.EnQue<float>(dynamicQuantLocal);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
|
||||
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::CopyOutXDynamicQuantFromScatter()
|
||||
{
|
||||
LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
|
||||
LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
|
||||
|
||||
DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
|
||||
DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
|
||||
DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
|
||||
DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
|
||||
LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();
|
||||
;
|
||||
|
||||
if constexpr (SMOOTHTYPE == SCALE_1H) {
|
||||
DataCopyPad(smoothLocal, quantSmoothGm_, smoothCopyParams, {false, 0, 0, 0});
|
||||
smoothInQueue_.EnQue(smoothLocal);
|
||||
smoothLocal = smoothInQueue_.DeQue<float>();
|
||||
}
|
||||
|
||||
int64_t dstIndexStart = this->curIndexStart_;
|
||||
int64_t dstIndexEnd = dstIndexStart + this->coreIndicesElements_ - 1;
|
||||
int32_t lastExpertIdx = -1;
|
||||
|
||||
for (int64_t dstIndex = dstIndexStart; dstIndex <= dstIndexEnd; dstIndex++) {
|
||||
if (this->dropPadMode_ == DROPLESS_MODE && dstIndex >= this->activeNum_) {
|
||||
break;
|
||||
}
|
||||
int32_t srcIdx = sortedRowIdx.GetValue(dstIndex);
|
||||
int32_t expertIdx = expandedExpertIdx.GetValue(dstIndex);
|
||||
if (expertIdx < this->expertStart_ || expertIdx >= this->expertEnd_) {
|
||||
break;
|
||||
}
|
||||
expertIdx = expertIdx - this->expertStart_;
|
||||
LocalTensor<T> xLocal = this->xCopyInQueue_.template AllocTensor<T>();
|
||||
// copy in single x
|
||||
if constexpr (IsSameType<T, float>::value) {
|
||||
DataCopyPad(xLocal, this->xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
|
||||
} else {
|
||||
DataCopyPad(xLocal[colsAlign_], this->xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams,
|
||||
{false, 0, 0, 0});
|
||||
}
|
||||
xCopyInQueue_.EnQue<T>(xLocal);
|
||||
|
||||
// copyin dynamic scale
|
||||
if constexpr (SMOOTHTYPE == SCALE_EH) {
|
||||
if (expertIdx != lastExpertIdx) {
|
||||
DataCopyPad(smoothLocal, quantSmoothGm_[expertIdx * this->cols_], smoothCopyParams, {false, 0, 0, 0});
|
||||
smoothInQueue_.EnQue(smoothLocal);
|
||||
smoothLocal = smoothInQueue_.DeQue<float>();
|
||||
lastExpertIdx = expertIdx;
|
||||
}
|
||||
}
|
||||
|
||||
ComputeQuant(smoothLocal);
|
||||
|
||||
LocalTensor<float> quantScaleLocal = scaleOutQueue_.DeQue<float>();
|
||||
DataCopyPad(expandedScaleGm_[dstIndex], quantScaleLocal, quantScaleParams);
|
||||
|
||||
LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
|
||||
DataCopyPad(this->expandedXGm_[dstIndex * this->cols_], outLocal, intriParams);
|
||||
|
||||
inputXOutQueue_.FreeTensor(outLocal);
|
||||
scaleOutQueue_.FreeTensor(quantScaleLocal);
|
||||
this->xCopyInQueue_.FreeTensor(xLocal);
|
||||
}
|
||||
smoothInQueue_.FreeTensor(smoothLocal);
|
||||
this->expandDstToSrcRowQueue_.EnQue(sortedRowIdx);
|
||||
this->expandedExpertIdxCopyOutQueue_.EnQue(expandedExpertIdx);
|
||||
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
// Gather-style copy-out with dynamic quantization: loads each source row once,
// quantizes it once, then writes the same int8 row and scale to every destination
// slot it was routed to (via expandedRowIdx). More efficient than the scatter
// path when k_ > 1 because each row is quantized a single time.
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::CopyOutXDynamicQuantFromGather()
{
    DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
    DataCopyExtParams smoothCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
    LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();
    int64_t curIndex = this->blockIdx_ * this->perCoreIndicesElements_;
    int64_t curIndexEnd = curIndex + this->coreIndicesElements_ - 1;

    // One shared smooth row (SCALE_1H): load it once before the row loop.
    if constexpr (SMOOTHTYPE == SCALE_1H) {
        DataCopyPad(smoothLocal, quantSmoothGm_, smoothCopyParams, {false, 0, 0, 0});
        smoothInQueue_.EnQue(smoothLocal);
        smoothLocal = smoothInQueue_.DeQue<float>();
    }

    for (int64_t row = this->startXRow_; row <= this->endXRow_; row++) {
        LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
        // Non-fp32 T is staged at colsAlign_ for the in-place fp32 cast in ComputeQuant.
        if constexpr (IsSameType<T, float>::value) {
            DataCopyPad(xLocal, this->xGm_[row * this->cols_], dataXCopyParams, {false, 0, 0, 0});
        } else {
            DataCopyPad(xLocal[colsAlign_], this->xGm_[row * this->cols_], dataXCopyParams, {false, 0, 0, 0});
        }
        xCopyInQueue_.EnQue<T>(xLocal);
        ComputeQuant(smoothLocal);

        LocalTensor<float> quantScaleLocal = scaleOutQueue_.DeQue<float>();
        LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
        // Emit this row to every destination slot whose flat index maps to it
        // (k_ consecutive routing entries per source row).
        while (curIndex <= curIndexEnd && curIndex / this->k_ == row) {
            int32_t outIndex = expandedRowIdx.GetValue(curIndex);
            curIndex++;
            // Skip unrouted slots (-1) and, in dropless mode, slots past activeNum_.
            // (&& binds tighter than ||, i.e. skip when outIndex == -1, or when
            // dropless AND outIndex >= activeNum_ — matching the scatter path.)
            if (outIndex == -1 || this->dropPadMode_ == DROPLESS_MODE && outIndex >= this->activeNum_) {
                continue;
            }
            DataCopyPad(expandedXGm_[outIndex * this->cols_], outLocal, intriParams);
            DataCopyPad(expandedScaleGm_[outIndex], quantScaleLocal, quantScaleParams);
        }

        xCopyInQueue_.FreeTensor(xLocal);
        inputXOutQueue_.FreeTensor(outLocal);
        scaleOutQueue_.FreeTensor(quantScaleLocal);
    }

    smoothInQueue_.FreeTensor(smoothLocal);
    // Hand the index tensor back for FreeLocalTensor().
    this->expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE, const int SMOOTHTYPE>
|
||||
__aicore__ inline void MoeCustomFullLoadDynamicQuant<T, COPYOUTTYPE, SMOOTHTYPE>::FreeLocalTensor()
|
||||
{
|
||||
if constexpr (!COPYOUTTYPE) {
|
||||
LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
|
||||
this->expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
|
||||
}
|
||||
LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
|
||||
LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
|
||||
this->expandDstToSrcRowQueue_.FreeTensor(sortedRowIdx);
|
||||
this->expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdx);
|
||||
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_DYNAMIC_QUANT_H
|
||||
@@ -0,0 +1,229 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_static_quant_full_load.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_STATIC_QUANT_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_STATIC_QUANT_H
|
||||
|
||||
#include "moe_custom_full_load_base.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
template <typename T>
// Full-load MoE routing kernel variant with static int8 quantization:
// out = round(x * scale_ + offset_), using one global scale and offset read
// from GM at Init time.
class MoeCustomFullLoadStaticQuant : public MoeCustomFullLoadBase<T> {
public:
    __aicore__ inline MoeCustomFullLoadStaticQuant(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedX,
                                GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Copies routed rows to GM after static quantization.
    __aicore__ inline void CopyOutXStaticQuant();
    // Drains and releases queue tensors still held at the end of Process().
    __aicore__ inline void FreeLocalTensor();
    // Statically quantizes xLocalLength staged rows to int8.
    __aicore__ inline void ComputeQuant(int64_t xLocalLength);

private:
    TQue<QuePosition::VECIN, 1> xCopyInQueue_;    // staged input row(s) of x
    TQue<QuePosition::VECOUT, 1> floatQueue_;     // fp32 scratch (non-fp32 T only)
    TQue<QuePosition::VECOUT, 1> halfQueue_;      // half scratch for the int8 cast chain
    TQue<QuePosition::VECOUT, 1> inputXOutQueue_; // quantized int8 output

    GlobalTensor<T> xGm_;              // input activations
    GlobalTensor<int8_t> expandedXGm_; // routed, quantized output
    GlobalTensor<float> scaleGm_;      // single static scale
    GlobalTensor<float> offsetGm_;     // single static offset

    float scale_;  // cached from scaleGm_[0] at Init
    float offset_; // cached from offsetGm_[0] at Init
};
|
||||
|
||||
template <typename T>
// Binds global buffers, caches the static scale/offset from GM, and sizes the
// unified-buffer queues. Buffer capacity covers `rowLength` rows: one row in EP
// mode (rows are re-loaded per destination slot), otherwise all source rows this
// core's index slice can touch.
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset,
                                                             GM_ADDR expandedX, GM_ADDR expandedRowIdx,
                                                             GM_ADDR expertTokensCountOrCumsum, GM_ADDR workspace,
                                                             const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    MoeCustomFullLoadBase<T>::Init(expertIdx, expandedRowIdx, expertTokensCountOrCumsum, workspace, tilingData, tPipe);

    xGm_.SetGlobalBuffer((__gm__ T *)x);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale, 1);
    offsetGm_.SetGlobalBuffer((__gm__ float *)offset, 1);
    // Scalar reads of the quant parameters straight from GM.
    this->scale_ = scaleGm_.GetValue(0);
    this->offset_ = offsetGm_.GetValue(0);
    // Ensure the scalar reads are visible before vector work uses scale_/offset_.
    SetWaitFlag<HardEvent::S_V>(HardEvent::S_V);
    int64_t curIndexStart = this->blockIdx_ * this->perCoreIndicesElements_;
    int64_t rowLength = 0;
    if (this->ep_) {
        rowLength = 1;
    } else {
        // Number of distinct source rows covered by this core's index slice.
        rowLength = (curIndexStart + this->coreIndicesElements_ - 1) / this->k_ - curIndexStart / this->k_ + 1;
    }
    int64_t xAlignedCount = Align(this->cols_, sizeof(int8_t));
    this->pipe_->InitBuffer(xCopyInQueue_, this->bufferNum_, xAlignedCount * sizeof(T) * rowLength);
    this->pipe_->InitBuffer(inputXOutQueue_, 1, xAlignedCount * sizeof(int8_t) * rowLength);
    this->pipe_->InitBuffer(floatQueue_, 1, xAlignedCount * sizeof(float) * rowLength);
    this->pipe_->InitBuffer(halfQueue_, 1, xAlignedCount * sizeof(half) * rowLength);
}
|
||||
|
||||
template <typename T>
// Kernel entry for the static-quant variant: routing sort, index/statistics
// copy-out, then the quantized data copy-out. Compute() may update needCoreNum_
// (in-kernel retiling), so it is re-checked after the call.
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::Process()
{
    if (this->blockIdx_ < this->needCoreNum_) {
        this->CopyIn();
        this->Compute();

        // Valid-expert count is zero: core 0 writes the default outputs, all exit.
        if (this->needCoreNum_ < 1) {
            if (this->blockIdx_ == 0) {
                if (this->rowIdxType_ == GATHER) {
                    this->CopyOutDefaultGatherIdx();
                }
                if (this->expertTokensNumFlag_ == 1) {
                    this->CopyOutDefaultTokenCountOrCumsum();
                }
            }
            return;
        }

        // Core 0 writes the row-index mapping output.
        if (this->blockIdx_ == 0) {
            this->CopyOutIdx();
        }
        // The last active core writes the per-expert token statistics.
        if (this->blockIdx_ == this->needCoreNum_ - 1 && this->expertTokensNumFlag_ == 1) {
            this->ComputeExpertTokenCountOrCumsum();
        }
        // Re-check against the possibly reduced needCoreNum_ before copy-out.
        if (this->blockIdx_ < this->needCoreNum_) {
            CopyOutXStaticQuant();
        }
        FreeLocalTensor();
    }
}
|
||||
|
||||
template <typename T>
// Statically quantizes xLocalLength staged rows: out = x * scale_ + offset_,
// rounded and cast to int8 via the fp32 -> int32 -> half -> int8 conversion chain.
// Dequeues the staged input, enqueues the int8 result, and frees all scratch.
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::ComputeQuant(int64_t xLocalLength)
{
    LocalTensor<float> floatLocal;
    LocalTensor<T> inLocal;
    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();
    LocalTensor<half> halfLocal = halfQueue_.AllocTensor<half>();
    // Per-row element count is padded to the int8 alignment used at Init.
    uint64_t elements = Align(this->cols_, sizeof(int8_t)) * xLocalLength;
    if constexpr (IsSameType<T, float>::value) {
        // fp32 input can be transformed in place; no separate fp32 scratch needed.
        floatLocal = this->xCopyInQueue_.template DeQue<float>();
    } else {
        inLocal = this->xCopyInQueue_.template DeQue<T>();
        floatLocal = floatQueue_.AllocTensor<float>();
        Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
        PipeBarrier<PIPE_V>();
    }
    Muls(floatLocal, floatLocal, this->scale_, elements);
    PipeBarrier<PIPE_V>();
    Adds(floatLocal, floatLocal, this->offset_, elements);
    PipeBarrier<PIPE_V>();
    // fp32 -> int32 (round-to-nearest-even) -> half -> int8 conversion chain.
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, floatLocal, RoundMode::CAST_RINT, elements);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(halfLocal, intLocal, RoundMode::CAST_ROUND, elements);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_TRUNC, elements);
    inputXOutQueue_.EnQue(outLocal);
    if constexpr (IsSameType<T, float>::value) {
        this->xCopyInQueue_.FreeTensor(floatLocal);
    } else {
        this->xCopyInQueue_.FreeTensor(inLocal);
        floatQueue_.FreeTensor(floatLocal);
    }

    halfQueue_.FreeTensor(halfLocal);
}
|
||||
|
||||
// Gathers this core's slice of input rows, statically quantizes each, and
// scatters the int8 rows into expandedX.
// Two layouts:
//  - ep_ path: iterate destination indices; fetch the source row per index
//    (srcIdx / k_ maps an expanded index back to its token row), skipping rows
//    past activeNum_ (dropless) or whose expert is outside [expertStart_, expertEnd_).
//  - non-ep path: bulk-load the contiguous row range [startXRow_, endXRow_],
//    quantize all rows at once, then scatter by expandedRowIdx lookups.
template <typename T>
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::CopyOutXStaticQuant()
{
    int64_t curIndex = this->curIndexStart_;
    int64_t curIndexEnd = curIndex + this->coreIndicesElements_ - 1;

    if (this->ep_) {
        LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
        LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();

        DataCopyExtParams dataXCopyParams{1, static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};

        for (int64_t dstIndex = curIndex; dstIndex <= curIndexEnd; dstIndex++) {
            if (this->dropPadMode_ == DROPLESS_MODE && dstIndex >= this->activeNum_) {
                break;
            }
            int32_t srcIdx = sortedRowIdx.GetValue(dstIndex);
            int32_t expertIdx = expandedExpertIdx.GetValue(dstIndex);
            // Indices are expert-sorted, so the first out-of-range expert ends this core's work.
            if (expertIdx < this->expertStart_ || expertIdx >= this->expertEnd_) {
                break;
            }
            LocalTensor<T> inLocal = this->xCopyInQueue_.template AllocTensor<T>();
            // Copy one source row of x in.
            DataCopyPad(inLocal, this->xGm_[srcIdx / this->k_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
            this->xCopyInQueue_.template EnQue<T>(inLocal);
            ComputeQuant(1);

            LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
            DataCopyPad(this->expandedXGm_[dstIndex * this->cols_], outLocal, intriParams);
            inputXOutQueue_.FreeTensor(outLocal);
        }
        // Hand the index tensors back so FreeLocalTensor can release them later.
        this->expandDstToSrcRowQueue_.EnQue(sortedRowIdx);
        this->expandedExpertIdxCopyOutQueue_.EnQue(expandedExpertIdx);
    } else {
        LocalTensor<T> xLocal = this->xCopyInQueue_.template AllocTensor<T>();
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        // Per-row stride of the local buffer, chosen to match the int8 output layout.
        int64_t inFactor = Align(this->cols_, sizeof(int8_t));
        uint32_t dstStride = (inFactor * sizeof(T) - AlignBytes(this->cols_, sizeof(T))) / BLOCK_BYTES;
        DataCopyExtParams dataXCopyParams{static_cast<uint16_t>(this->endXRow_ - this->startXRow_ + 1),
                                          static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, dstStride, 0};
        DataCopyPad(xLocal, this->xGm_[this->startXRow_ * this->cols_], dataXCopyParams, {false, 0, 0, 0});
        this->xCopyInQueue_.EnQue(xLocal);
        // Ensure the bulk GM->UB copy lands before the vector quantization reads it.
        SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
        ComputeQuant(this->endXRow_ - this->startXRow_ + 1);

        LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
        int64_t k = 0;
        DataCopyExtParams intriParams{1, static_cast<uint32_t>(this->cols_ * sizeof(int8_t)), 0, 0, 0};
        for (int64_t i = this->startXRow_; i <= this->endXRow_; i++) {
            // Each source row i expands to up to k_ destination slots; walk indices while they map to row i.
            for (; k < this->coreIndicesElements_ && curIndex / this->k_ == i; curIndex++, k++) {
                int32_t outIndex = expandedRowIdx.GetValue(curIndex);
                if (outIndex < this->activeNum_) {
                    DataCopyPad(this->expandedXGm_[outIndex * this->cols_], outLocal[(i - this->startXRow_) * inFactor],
                                intriParams);
                }
            }
        }
        inputXOutQueue_.FreeTensor(outLocal);
        this->expandedRowIdxCopyOutQueue_.EnQue(expandedRowIdx);
    }
}
|
||||
|
||||
// Drains and frees the index tensors that earlier phases re-enqueued, so the
// queues end the kernel balanced. The expandedRowIdx queue is only populated
// on the non-ep path, hence the guard.
template <typename T>
__aicore__ inline void MoeCustomFullLoadStaticQuant<T>::FreeLocalTensor()
{
    if (!this->ep_) {
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        this->expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
    }
    LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
    this->expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdx);
    LocalTensor<int32_t> sortedRowIdx = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
    this->expandDstToSrcRowQueue_.FreeTensor(sortedRowIdx);
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_STATIC_QUANT_H
|
||||
@@ -0,0 +1,224 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_full_load_unquantized.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_FULL_LOAD_UNQUANTIZED_H
|
||||
#define MOE_CUSTOM_FULL_LOAD_UNQUANTIZED_H
|
||||
|
||||
#include "moe_custom_full_load_base.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Full-load MoE routing kernel variant without quantization: routing indices
// are computed by MoeCustomFullLoadBase, and this class gathers/scatters the
// raw rows of x (and optionally a per-row float scale) into expandedX /
// expandedScale in the routed order.
template <typename T>
class MoeCustomFullLoadUnquantized : public MoeCustomFullLoadBase<T> {
public:
    __aicore__ inline MoeCustomFullLoadUnquantized(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expandedRowIdx,
                                GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

protected:
    __aicore__ inline void FreeLocalTensor();  // releases index tensors held across phases
    __aicore__ inline void GatherOutX();       // copies x rows to expandedX in routed order
    __aicore__ inline void CopyOutScale();     // copies per-row scale alongside x (optional)

protected:
    TQue<QuePosition::VECIN, 1> xCopyInQueue_;      // staging for x rows (GM -> UB -> GM)
    TQue<QuePosition::VECIN, 1> scaleCopyInQueue_;  // staging for one scale value at a time

    GlobalTensor<T> xGm_;                  // input activations [n, cols]
    GlobalTensor<float> scaleGm_;          // optional per-row scale input
    GlobalTensor<T> expandedXGm_;          // routed output rows
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> expandedScaleGm_;  // routed per-row scale output
};
|
||||
|
||||
// Binds GM buffers and sizes the UB staging queues. The base-class Init handles
// everything related to routing indices; this override only adds the x / scale
// buffers. On the ep path one row is staged at a time; otherwise the whole
// contiguous row range this core owns (row_length rows) is staged at once.
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX,
                                                             GM_ADDR expandedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                                             GM_ADDR expandedScale, GM_ADDR workspace,
                                                             const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    MoeCustomFullLoadBase<T>::Init(expertIdx, expandedRowIdx, expertTokensCountOrCumsum, workspace, tilingData, tPipe);
    xGm_.SetGlobalBuffer((__gm__ T *)x);
    if (this->isInputScale_) {
        scaleGm_.SetGlobalBuffer((__gm__ float *)scale);
        expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);
    }

    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX);
    // NOTE(review): buffSize is computed but never used in this function — dead local?
    int64_t buffSize = this->sortNum_ * sizeof(int32_t);
    // Number of distinct source rows covered by this core's expanded-index range
    // (each source row contributes k_ consecutive expanded indices).
    int64_t row_length =
        (this->curIndexStart_ + this->coreIndicesElements_ - 1) / this->k_ - this->curIndexStart_ / this->k_ + 1;

    if (this->ep_) {
        this->pipe_->InitBuffer(xCopyInQueue_, this->bufferNum_, AlignBytes(this->cols_, sizeof(T)));
    } else {
        this->pipe_->InitBuffer(xCopyInQueue_, this->bufferNum_, AlignBytes(this->cols_, sizeof(T)) * row_length);
    }
    this->pipe_->InitBuffer(scaleCopyInQueue_, 1, AlignBytes(1, sizeof(float)));
}
|
||||
|
||||
// Top-level per-core driver: build routing indices (CopyIn/Compute from the
// base class), then divide the output phases among cores — core 0 writes the
// index outputs, the last core writes the per-expert token counts/cumsum, and
// every active core gathers its share of x rows (plus scales when present).
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::Process()
{
    if (this->blockIdx_ < this->needCoreNum_) {
        this->CopyIn();
        this->Compute();

        // Valid-expert count is zero: emit default outputs and bail out early.
        // NOTE(review): blockIdx_ >= 0, so needCoreNum_ < 1 can never hold inside
        // the enclosing `blockIdx_ < needCoreNum_` check — this branch looks
        // unreachable; confirm whether it was meant to sit outside the outer if.
        if (this->needCoreNum_ < 1) {
            if (this->blockIdx_ == 0) {
                if (this->rowIdxType_ == GATHER) {
                    this->CopyOutDefaultGatherIdx();
                }
                if (this->expertTokensNumFlag_ == 1) {
                    this->CopyOutDefaultTokenCountOrCumsum();
                }
            }
            return;
        }

        // Core 0 owns the routing-index outputs.
        if (this->blockIdx_ == 0) {
            this->CopyOutIdx();
        }

        // Last active core owns the per-expert token count / cumsum output.
        if (this->blockIdx_ == this->needCoreNum_ - 1 && this->expertTokensNumFlag_ == 1) {
            this->ComputeExpertTokenCountOrCumsum();
        }

        if (this->blockIdx_ < this->needCoreNum_) {
            this->GatherOutX();
            if (this->isInputScale_) {
                this->CopyOutScale();
            }
        }

        this->FreeLocalTensor();
    }
}
|
||||
|
||||
// Copies x rows into expandedX in routed order, without quantization.
//  - ep_ path: per destination row, look up the source row via
//    expandDstToSrcRow (rowIdx / k_), bounce one row through UB, stopping at
//    activeNum_ or the first expert outside [expertStart_, expertEnd_).
//  - non-ep path: bulk-load this core's contiguous source rows once, then
//    scatter each row to the destinations named by expandedRowIdx.
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::GatherOutX()
{
    if (this->ep_) {
        LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
        LocalTensor<int32_t> expandDstToSrcRowLocal = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
        int64_t startRowIdx = this->blockIdx_ * this->perCoreIndicesElements_;
        int64_t endRowIdx = startRowIdx + this->coreIndicesElements_;
        LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPadExtParams<T> padParams{false, 0, 0, 0};
        for (int64_t i = startRowIdx; i < endRowIdx && i < this->activeNum_; i++) {
            int32_t curExpertId = expandedExpertIdx.GetValue(i);
            // Expert-sorted order: first out-of-range expert ends this core's slice.
            if (curExpertId < this->expertStart_ || curExpertId >= this->expertEnd_) {
                break;
            }
            int64_t rowIdx = expandDstToSrcRowLocal.GetValue(i);
            int64_t srcOffset = rowIdx / this->k_ * this->cols_;
            int64_t dstOffset = i * this->cols_;
            // Same UB buffer is reused every iteration, so serialize the
            // out-copy of the previous row against the next in-copy, and the
            // in-copy against the out-copy.
            SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
            DataCopyPad(xLocal, xGm_[srcOffset], copyParams, padParams);
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            DataCopyPad(expandedXGm_[dstOffset], xLocal, copyParams);
        }
        xCopyInQueue_.FreeTensor(xLocal);
        // Re-enqueue so CopyOutScale / FreeLocalTensor can dequeue them again.
        this->expandedExpertIdxCopyOutQueue_.template EnQue<int32_t>(expandedExpertIdx);
        this->expandDstToSrcRowQueue_.template EnQue<int32_t>(expandDstToSrcRowLocal);
    } else {
        LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
        DataCopyExtParams dataXCopyParams{static_cast<uint16_t>(this->endXRow_ - this->startXRow_ + 1),
                                          static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPadExtParams<T> dataXCopyPadParams{false, 0, 0, 0};
        DataCopyPad(xLocal, xGm_[this->startXRow_ * this->cols_], dataXCopyParams, dataXCopyPadParams);
        // Bulk load must complete before the scatter copies read the buffer.
        SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
        int64_t inFactor = Align(this->cols_, sizeof(T));  // per-row stride in the local buffer
        DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(this->cols_ * sizeof(T)), 0, 0, 0};
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        int64_t curIndexStart = this->curIndexStart_;
        int64_t k = 0;
        for (int64_t i = this->startXRow_; i <= this->endXRow_; i++) {
            // Each source row i serves the expanded indices whose index / k_ == i.
            for (; k < this->coreIndicesElements_ && curIndexStart / this->k_ == i; curIndexStart++, k++) {
                int32_t outIndex = expandedRowIdx.GetValue(curIndexStart);
                if (outIndex < this->activeNum_) {
                    DataCopyPad(expandedXGm_[outIndex * this->cols_], xLocal[(i - this->startXRow_) * inFactor],
                                copyParams);
                }
            }
        }
        xCopyInQueue_.FreeTensor(xLocal);
        this->expandedRowIdxCopyOutQueue_.template EnQue<int32_t>(expandedRowIdx);
    }
}
|
||||
|
||||
// Drains and frees the index tensors that GatherOutX / CopyOutScale re-enqueued,
// leaving all queues balanced at kernel end. The expandedRowIdx queue only holds
// a tensor on the non-ep path.
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::FreeLocalTensor()
{
    LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
    LocalTensor<int32_t> expandDstToSrcRowLocal = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
    this->expandedExpertIdxCopyOutQueue_.FreeTensor(expandedExpertIdx);
    this->expandDstToSrcRowQueue_.FreeTensor(expandDstToSrcRowLocal);
    if (!this->ep_) {
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        this->expandedRowIdxCopyOutQueue_.FreeTensor(expandedRowIdx);
    }
}
|
||||
|
||||
// Routes the optional per-row float scale alongside x: one scalar is bounced
// through UB per source row and written to every destination slot that row maps
// to. Mirrors GatherOutX's two traversal orders (ep: destination-driven lookup;
// non-ep: source-row sweep scattered via expandedRowIdx).
template <typename T>
__aicore__ inline void MoeCustomFullLoadUnquantized<T>::CopyOutScale()
{
    LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
    DataCopyPadExtParams<float> padParams{false, 0, 0, 0};
    if (this->ep_) {
        LocalTensor<int32_t> expandedExpertIdx = this->expandedExpertIdxCopyOutQueue_.template DeQue<int32_t>();
        LocalTensor<int32_t> expandDstToSrcRowLocal = this->expandDstToSrcRowQueue_.template DeQue<int32_t>();
        int64_t startRowIdx = this->blockIdx_ * this->perCoreIndicesElements_;
        int64_t endRowIdx = startRowIdx + this->coreIndicesElements_;
        for (int64_t i = startRowIdx; i < endRowIdx && i < this->activeNum_; i++) {
            int32_t curExpertId = expandedExpertIdx.GetValue(i);
            // Same early-exit rule as GatherOutX: stop at the first foreign expert.
            if (curExpertId < this->expertStart_ || curExpertId >= this->expertEnd_) {
                break;
            }
            int64_t rowIdx = expandDstToSrcRowLocal.GetValue(i);
            // Single shared UB scalar: serialize out-copy vs. next in-copy and
            // in-copy vs. out-copy with hardware events.
            SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
            DataCopyPad(scaleLocal, scaleGm_[rowIdx / this->k_], copyParams, padParams);
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            DataCopyPad(expandedScaleGm_[i], scaleLocal, copyParams);
        }
        this->expandedExpertIdxCopyOutQueue_.template EnQue<int32_t>(expandedExpertIdx);
        this->expandDstToSrcRowQueue_.template EnQue<int32_t>(expandDstToSrcRowLocal);
    } else {
        LocalTensor<int32_t> expandedRowIdx = this->expandedRowIdxCopyOutQueue_.template DeQue<int32_t>();
        int64_t curIndexStart = this->curIndexStart_;
        int64_t k = 0;
        for (int64_t i = this->startXRow_; i <= this->endXRow_; i++) {
            SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
            DataCopyPad(scaleLocal, scaleGm_[i], copyParams, padParams);
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            // Write row i's scale to every expanded slot mapped to row i.
            for (; k < this->coreIndicesElements_ && curIndexStart / this->k_ == i; curIndexStart++, k++) {
                int32_t outIndex = expandedRowIdx.GetValue(curIndexStart);
                if (outIndex < this->activeNum_) {
                    DataCopyPad(expandedScaleGm_[outIndex], scaleLocal, copyParams);
                }
            }
        }
        this->expandedRowIdxCopyOutQueue_.template EnQue<int32_t>(expandedRowIdx);
    }
    scaleCopyInQueue_.FreeTensor(scaleLocal);
}
|
||||
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_FULL_LOAD_UNQUANTIZED_H
|
||||
@@ -0,0 +1,238 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_droppad_static_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_DROPPAD_STATIC_QUANT_H
|
||||
#define MOE_CUSTOM_GATHER_DROPPAD_STATIC_QUANT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Double-buffering depth for the copy-in/copy-out queues of this kernel.
constexpr int64_t GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM = 2;

// Gather/drop-pad stage with static int8 quantization: reads rows of inputX
// addressed through expandedRowIdx, quantizes them with a single scalar
// scale/offset (loaded from GM at Init), and writes int8 rows to expandedX.
// Rows are tiled both by row (perLoopRows_/lastLoopRows_) and by column
// (perLoopCols_/lastLoopCols_) per the tiling data. An expandedRowIdx value of
// -1 marks a dropped row.
template <typename T>
class MoeGatherDroppadQuant {
public:
    __aicore__ inline MoeGatherDroppadQuant(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedRowIdx,
                                GM_ADDR expandedX, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyExpertIn(int64_t progress);              // loads one tile of expandedRowIdx
    __aicore__ inline void Compute();                                   // quantizes the staged column tile
    __aicore__ inline void CopyXIn(int64_t xSrcOffset, int64_t curLoopCols);  // loads one column tile of a row
    __aicore__ inline void CopyOut(int64_t progress);                   // scatters quantized tiles to expandedX

private:
    TPipe *pipe_;
    TQue<QuePosition::VECIN, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM> inputXCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM> expandRowIdxCopyInQueue_;
    TQue<QuePosition::VECOUT, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM> inputXCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> floatQueue_;  // float scratch for the cast chain
    TQue<QuePosition::VECOUT, 1> halfQueue_;   // half scratch for the cast chain

    GlobalTensor<T> inputXGm_;
    GlobalTensor<int8_t> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> scaleGm_;   // single scalar quant scale
    GlobalTensor<float> offsetGm_;  // single scalar quant offset

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Tiling-derived per-core loop bounds (set in Init).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t currentLoopRows_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    float scale_;   // cached from scaleGm_[0]
    float offset_;  // cached from offsetGm_[0]

    int64_t indicesOffset_;
    int64_t inputOffset_;
    int64_t outOffset_;
};
|
||||
|
||||
// Loads the tile of expandedRowIdx for loop iteration `progress`
// (currentLoopRows_ entries starting at progress * perLoopRows_) into UB.
// Note: expandedRowIdxGm_ was offset to this core's slice in Init.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::CopyExpertIn(int64_t progress)
{
    indicesOffset_ = progress * perLoopRows_;
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams);
    expandRowIdxCopyInQueue_.EnQue<int32_t>(indicesLocal);
}
|
||||
|
||||
// Loads one column tile (curLoopCols elements starting at GM offset xSrcOffset)
// of an inputX row into UB and enqueues it for Compute.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::CopyXIn(int64_t xSrcOffset, int64_t curLoopCols)
{
    LocalTensor<T> inLocal = inputXCopyInQueue_.AllocTensor<T>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
    DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal, inputXGm_[xSrcOffset], dataCopyParams, dataCopyPadParams);
    inputXCopyInQueue_.EnQue(inLocal);
}
|
||||
|
||||
// Statically quantizes the currently staged column tile to int8:
// out = trunc(round(rint(x * scale_ + offset_))) via the
// float -> int32 -> half -> int8 hardware cast chain (the int32 stage reuses
// the float buffer through ReinterpretCast). Consumes one tensor from
// inputXCopyInQueue_ and produces one on inputXCopyOutQueue_.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::Compute()
{
    LocalTensor<float> floatLocal;
    LocalTensor<T> inLocal;
    LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.AllocTensor<int8_t>();
    LocalTensor<half> halfLocal = halfQueue_.AllocTensor<half>();
    // NOTE(review): aligned by sizeof(T) here, unlike the full-load variant
    // which aligns by sizeof(int8_t); and uint32_t vs uint64_t — confirm intent.
    uint32_t elements = Align(colsTileLength_, sizeof(T));
    if constexpr (IsSameType<T, float>::value) {
        floatLocal = inputXCopyInQueue_.DeQue<float>();
    } else {
        inLocal = inputXCopyInQueue_.DeQue<T>();
        floatLocal = floatQueue_.AllocTensor<float>();
        Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
        PipeBarrier<PIPE_V>();
    }
    Muls(floatLocal, floatLocal, scale_, elements);
    PipeBarrier<PIPE_V>();
    Adds(floatLocal, floatLocal, offset_, elements);
    PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, floatLocal, RoundMode::CAST_RINT, elements);
    PipeBarrier<PIPE_V>();
    // Identity dequant scale so the int32 -> half cast does not rescale.
    SetDeqScale((half)1.000000e+00f);
    PipeBarrier<PIPE_V>();
    Cast(halfLocal, intLocal, RoundMode::CAST_ROUND, elements);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_TRUNC, elements);
    inputXCopyOutQueue_.EnQue(outLocal);
    // Release input-side buffers; only the branch that allocated them frees them.
    if constexpr (IsSameType<T, float>::value) {
        inputXCopyInQueue_.FreeTensor(floatLocal);
    } else {
        inputXCopyInQueue_.FreeTensor(inLocal);
        floatQueue_.FreeTensor(floatLocal);
    }
    halfQueue_.FreeTensor(halfLocal);
}
|
||||
|
||||
// Drives one row-tile iteration: for each column tile, walk the distinct source
// rows covered by this tile's expanded indices (index / k_ maps an expanded
// index to its source row), quantize each source row's column tile once, then
// write it to every destination slot the indices name. Index -1 means the slot
// was dropped and gets no output.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::CopyOut(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.DeQue<int32_t>();
    // Scalar GetValue reads below must see the completed index copy-in.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    colsTileLength_ = perLoopCols_;
    for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
        // Absolute expanded index this tile starts at (per-core base + loop offset).
        int64_t initialRow = gatherOutTilingData_->perCoreIndicesElements * blockIdx_ + perLoopRows_ * progress;
        int64_t curLoopRow = 0;
        if (colsLoop == colLoops_ - 1) {
            colsTileLength_ = lastLoopCols_;
        }
        int64_t currentLoopStartRow = initialRow / k_;
        int64_t currentLoopLastRow = (initialRow + currentLoopRows_ - 1) / k_;
        for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
            inputOffset_ = row * cols_ + colsLoop * perLoopCols_;
            // Stage and quantize this source row's column tile once.
            CopyXIn(inputOffset_, colsTileLength_);
            Compute();
            LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.DeQue<int8_t>();
            DataCopyExtParams intriParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(int8_t)), 0, 0, 0};
            // Emit to every expanded slot that maps back to this source row.
            while (curLoopRow < currentLoopRows_ && initialRow / k_ == row) {
                int32_t outIndex = indicesLocal.GetValue(curLoopRow);
                curLoopRow++;
                initialRow++;
                if (outIndex == -1) {
                    continue;  // dropped slot
                }
                outOffset_ = outIndex * cols_ + colsLoop * perLoopCols_;
                DataCopyPad(expandedXGm_[outOffset_], outLocal, intriParams);
            }
            inputXCopyOutQueue_.FreeTensor(outLocal);
        }
    }
    expandRowIdxCopyInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
// Binds GM buffers, reads this core's loop bounds from the tiling data (the
// last active core gets the remainder-sized bounds), caches the scalar
// quantization scale/offset from GM, and sizes the UB queues for one column
// tile each. expandedRowIdxGm_ is pre-offset to this core's index slice.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset,
                                                      GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR workspace,
                                                      const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);

    needCoreNum_ = gatherOutTilingData_->needCoreNum;
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;

    // The last active core handles the tail-sized row counts.
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = gatherOutTilingData_->lastCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->lastCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->lastCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->lastCoreIndicesLoops;
    } else {
        coreRows_ = gatherOutTilingData_->perCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->perCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->perCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->perCoreIndicesLoops;
    }
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);
    // Offset the index buffer to this core's slice so loop offsets are core-local.
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx +
                                          blockIdx_ * gatherOutTilingData_->perCoreIndicesElements,
                                      Align(coreRows_, sizeof(int32_t)));
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale, 1);
    offsetGm_.SetGlobalBuffer((__gm__ float *)offset, 1);
    // Static quantization uses a single scalar scale/offset, cached once here.
    scale_ = scaleGm_.GetValue(0);
    offset_ = offsetGm_.GetValue(0);

    pipe_->InitBuffer(inputXCopyInQueue_, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(inputXCopyOutQueue_, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM,
                      AlignBytes(perLoopCols_, sizeof(int8_t)));
    pipe_->InitBuffer(expandRowIdxCopyInQueue_, GATHER_OUT_DROPPAD_QUANT_BUFFER_NUM,
                      AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(floatQueue_, 1, AlignBytes(perLoopCols_, sizeof(float)));
    pipe_->InitBuffer(halfQueue_, 1, AlignBytes(perLoopCols_, sizeof(half)));
}
|
||||
|
||||
// Per-core driver: iterate this core's row tiles, loading one tile of
// expanded indices per loop and then gathering/quantizing/scattering it.
// The final loop uses the tail row count.
template <typename T>
__aicore__ inline void MoeGatherDroppadQuant<T>::Process()
{
    if (blockIdx_ < needCoreNum_) {
        currentLoopRows_ = perLoopRows_;
        for (int64_t loop = 0; loop < rowLoops_; loop++) {
            if (loop == rowLoops_ - 1) {
                currentLoopRows_ = lastLoopRows_;
            }
            CopyExpertIn(loop);
            CopyOut(loop);
        }
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_DROPPAD_STATIC_QUANT_H
|
||||
@@ -0,0 +1,602 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_dynamic_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_DYNAMIC_QUANT_H
|
||||
#define MOE_CUSTOM_GATHER_DYNAMIC_QUANT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
// Double-buffering depth for this kernel's staging queues.
constexpr int64_t GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM = 2;

// Gather stage with per-row dynamic int8 quantization: each output row is
// scaled by max(|row|)/127 (optionally after multiplying by a smooth vector),
// the int8 row goes to expandedX and the per-row scale to expandedScale.
// COPYOUTTYPE selects the traversal direction (gather vs. scatter variants of
// the copy-out methods below).
template <typename T, const int COPYOUTTYPE>
class MoeGatherOutDynamicQuant {
public:
    __aicore__ inline MoeGatherOutDynamicQuant(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR quantSmooth, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR expandedScale, GM_ADDR sortedExpertIdx,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyOutXDynamicQuantFromGather(int64_t progress);
    __aicore__ inline void CopyOutXDynamicQuantFromScatter(int64_t progress);
    __aicore__ inline void CopyOutXPartialDynamicQuantFromGather(int64_t progress);
    __aicore__ inline void CopyOutXPartialDynamicQuantFromScatter(int64_t progress);
    __aicore__ inline void CopyInExpandedExpertIdx(int64_t progress);  // loads row indices + expert ids
    __aicore__ inline void Compute(LocalTensor<float> &smoothLocal);   // quantizes one staged row
    __aicore__ inline float ComputeMax(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                       LocalTensor<float> &scaleLocal, int32_t srcIdx, int32_t expertIdx, int64_t j);
    __aicore__ inline void ComputeScale(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal, float scaleTemp,
                                        int64_t dstIndex, int64_t j);

private:
    TPipe *pipe_;
    TQue<QuePosition::VECIN, 1> inputXInQueue_;
    TQue<QuePosition::VECIN, 1> smoothInQueue_;      // quant smooth vector staging
    TQue<QuePosition::VECIN, 1> expandRowIdxInQueue_;  // holds row indices and expert ids back-to-back
    TQue<QuePosition::VECOUT, 1> calcQueue_;         // float scratch
    TQue<QuePosition::VECOUT, 1> inputXOutQueue_;    // int8 output rows
    TQue<QuePosition::VECOUT, 1> scaleOutQueue_;     // per-row dynamic scales

    GlobalTensor<T> inputXGm_;
    GlobalTensor<int8_t> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> quantSmoothGm_;
    GlobalTensor<float> expandedScaleGm_;
    GlobalTensor<float> quantTempGm_;
    GlobalTensor<int32_t> expandedExpertIdxGm_;
    GlobalTensor<int32_t> expertTotalCountGm_;

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Tiling-derived per-core loop bounds and mode flags (set in Init, not
    // visible in this chunk).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t totalLength_;
    int64_t perCoreRow_;
    int64_t currentLoopRows_;
    int64_t currentLoopRowsAlign_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t perLoopColsAlign_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    int64_t isInputScale_;
    int64_t expertStart_;

    int64_t indicesOffset_;
    int64_t rowIdxType_ = 0;
    int64_t dropPadMode_;
    int64_t activeNum_;
    int64_t ep_;
    int64_t smoothType_;  // SCALE_1H (shared [1,H] smooth) vs SCALE_EH (per-expert)
    int64_t coreNum_;
    int64_t expertTotalCount_ = 0;
};
|
||||
|
||||
// Loads this loop iteration's tile of row indices and, packed after them at
// offset currentLoopRowsAlign_ in the same local tensor, the matching expanded
// expert ids. Consumers read index i and expert currentLoopRowsAlign_ + i.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyInExpandedExpertIdx(int64_t progress)
{
    indicesOffset_ = progress * perLoopRows_;
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams)_;
    DataCopyPad(indicesLocal[currentLoopRowsAlign_], expandedExpertIdxGm_[indicesOffset_], dataCopyParams,
                dataCopyPadParams);
    expandRowIdxInQueue_.EnQue<int32_t>(indicesLocal);
}
|
||||
|
||||
// Dynamically quantizes one staged row: optionally multiplies by smoothLocal,
// computes scale = max(|row|) / 127, divides the row by it, and converts to
// int8 via the int32 -> half -> int8 cast chain. Enqueues the int8 row on
// inputXOutQueue_ and the scalar scale on scaleOutQueue_.
// For non-float T the raw data was staged at offset perLoopColsAlign_ inside
// the same buffer, so the widening cast can write to its front in place.
// NOTE(review): inLocal is dequeued but neither freed nor re-enqueued here —
// presumably the caller releases it; confirm against the copy-out methods.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::Compute(LocalTensor<float> &smoothLocal)
{
    LocalTensor<float> inLocal = inputXInQueue_.DeQue<float>();

    LocalTensor<float> tempLocal = calcQueue_.AllocTensor<float>();
    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();
    LocalTensor<float> scaleLocal = scaleOutQueue_.AllocTensor<float>();

    if constexpr (!IsSameType<T, float>::value) {
        // Widen T -> float from the staged back half of the buffer into its front.
        Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign_], RoundMode::CAST_NONE, cols_);
        PipeBarrier<PIPE_V>();
    }

    if (isInputScale_) {
        // Apply the smooth vector before measuring the row maximum.
        Mul(inLocal, inLocal, smoothLocal, cols_);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, cols_);
    PipeBarrier<PIPE_V>();

    ReduceMax(scaleLocal, tempLocal, tempLocal, cols_); // get max value and index [0,1]

    // Per-row dynamic scale; MAX_INT8 maps the row maximum to the int8 range.
    float scaleValue = scaleLocal.GetValue(0) / MAX_INT8;

    // Broadcast the scale: one block's worth for the scale output, a full row
    // for the element-wise division.
    Duplicate<float>(scaleLocal, scaleValue, INT32_ONE_BLOCK_NUM);
    PipeBarrier<PIPE_V>();
    Duplicate<float>(tempLocal, scaleValue, cols_);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, cols_);
    PipeBarrier<PIPE_V>();

    // Reuse tempLocal as int32 storage for the rounding stage.
    LocalTensor<int32_t> intLocal = tempLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, tempLocal, RoundMode::CAST_RINT, cols_);
    PipeBarrier<PIPE_V>();
    // Identity dequant scale so the int32 -> half cast does not rescale.
    SetDeqScale((half)1.000000e+00f);
    Cast(intLocal.ReinterpretCast<half>(), intLocal, RoundMode::CAST_ROUND, cols_);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, intLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, cols_);

    calcQueue_.FreeTensor(tempLocal);
    inputXOutQueue_.EnQue(outLocal);
    scaleOutQueue_.EnQue(scaleLocal);
}
|
||||
|
||||
// Scatter-style dynamic-quant copy-out (single column pass, row fits in UB):
// destination row index is the sorted position (rowOffset + i); the source row
// is looked up through indicesLocal. Each row is loaded, quantized via
// Compute(), and the int8 row plus its scale are written to GM.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXDynamicQuantFromScatter(int64_t progress)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(T)), 0, 0, 0};
    DataCopyExtParams smoothParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(int8_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
    LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();

    // copyin [1,H] scale: a single shared smooth vector, loaded once per loop.
    if (smoothType_ == SCALE_1H) {
        DataCopyPad(smoothLocal, quantSmoothGm_, smoothParams, {false, 0, 0, 0});
        smoothInQueue_.EnQue(smoothLocal);
        smoothLocal = smoothInQueue_.DeQue<float>();
    }

    int32_t lastExpertIdx = -1; // expert whose smooth vector is currently resident
    for (int64_t i = 0; i < currentLoopRows_; i++) {
        int64_t rowOffset = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        // Destination offsets grow monotonically, so once activeNum_ is reached
        // in dropless mode the remaining rows of this loop can be skipped.
        if (dropPadMode_ == DROPLESS_MODE && (rowOffset + i) >= activeNum_) {
            break;
        }
        LocalTensor<T> inLocal = inputXInQueue_.AllocTensor<T>();
        int32_t srcIdx = indicesLocal.GetValue(i);

        // The second half of indicesLocal (offset currentLoopRowsAlign_) holds the
        // sorted expert ids staged by CopyInExpandedExpertIdx.
        int32_t expertIdx = indicesLocal.GetValue(currentLoopRowsAlign_ + i) - expertStart_;
        if constexpr (IsSameType<T, float>::value) {
            DataCopyPad(inLocal, inputXGm_[srcIdx / k_ * cols_], copyInParams, {false, 0, 0, 0});
        } else {
            // Narrow input is staged at offset perLoopColsAlign_; Compute() widens it.
            DataCopyPad(inLocal[perLoopColsAlign_], inputXGm_[srcIdx / k_ * cols_], copyInParams, {false, 0, 0, 0});
        }
        inputXInQueue_.EnQue<T>(inLocal);

        // copyin dynamic scale: per-expert smooth vector, reloaded only when the
        // expert changes (indices are sorted by expert, so reloads are rare).
        if (smoothType_ == SCALE_EH && expertIdx != lastExpertIdx) {
            DataCopyPad(smoothLocal, quantSmoothGm_[expertIdx * this->cols_], smoothParams, {false, 0, 0, 0});
            smoothInQueue_.EnQue(smoothLocal);
            smoothLocal = smoothInQueue_.DeQue<float>();
            lastExpertIdx = expertIdx;
        }
        Compute(smoothLocal);
        inputXInQueue_.FreeTensor(inLocal);
        LocalTensor<float> scaleLocal = scaleOutQueue_.DeQue<float>();
        DataCopyPad(expandedScaleGm_[(rowOffset + i)], scaleLocal, quantScaleParams);
        LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
        DataCopyPad(expandedXGm_[(rowOffset + i) * cols_], outLocal, copyOutParams);

        inputXOutQueue_.FreeTensor(outLocal);
        scaleOutQueue_.FreeTensor(scaleLocal);
    }

    smoothInQueue_.FreeTensor(smoothLocal);
    expandRowIdxInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
template <typename T, const int COPYOUTTYPE>
|
||||
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXDynamicQuantFromGather(int64_t progress)
|
||||
{
|
||||
DataCopyExtParams copyInParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(T)), 0, 0, 0};
|
||||
DataCopyExtParams smoothParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(float)), 0, 0, 0};
|
||||
DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(perLoopCols_ * sizeof(int8_t)), 0, 0, 0};
|
||||
DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
|
||||
LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
|
||||
LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();
|
||||
|
||||
int64_t rowOffset = blockIdx_ * perCoreRow_ + progress * perLoopRows_;
|
||||
int64_t startXRow = rowOffset / k_;
|
||||
int64_t endXRow = (rowOffset + currentLoopRows_ - 1) / k_;
|
||||
int64_t curIndex = 0;
|
||||
|
||||
if (smoothType_ == SCALE_1H) {
|
||||
DataCopyPad(smoothLocal, quantSmoothGm_, smoothParams, {false, 0, 0, 0});
|
||||
smoothInQueue_.EnQue(smoothLocal);
|
||||
smoothLocal = smoothInQueue_.DeQue<float>();
|
||||
}
|
||||
|
||||
for (int64_t row = startXRow; row <= endXRow; row++) {
|
||||
LocalTensor<T> inLocal = inputXInQueue_.AllocTensor<T>();
|
||||
if constexpr (IsSameType<T, float>::value) {
|
||||
DataCopyPad(inLocal, inputXGm_[row * cols_], copyInParams, {false, 0, 0, 0});
|
||||
} else {
|
||||
DataCopyPad(inLocal[perLoopColsAlign_], inputXGm_[row * cols_], copyInParams, {false, 0, 0, 0});
|
||||
}
|
||||
inputXInQueue_.EnQue<T>(inLocal);
|
||||
Compute(smoothLocal);
|
||||
LocalTensor<float> scaleLocal = scaleOutQueue_.DeQue<float>();
|
||||
LocalTensor<int8_t> outLocal = inputXOutQueue_.DeQue<int8_t>();
|
||||
|
||||
while (curIndex < currentLoopRows_ && (rowOffset + curIndex) / this->k_ == row) {
|
||||
int32_t outIndex = indicesLocal.GetValue(curIndex);
|
||||
curIndex++;
|
||||
if (outIndex == -1 || dropPadMode_ == DROPLESS_MODE && outIndex >= this->activeNum_) {
|
||||
continue;
|
||||
}
|
||||
DataCopyPad(expandedXGm_[outIndex * cols_], outLocal, copyOutParams);
|
||||
DataCopyPad(expandedScaleGm_[outIndex], scaleLocal, quantScaleParams);
|
||||
}
|
||||
|
||||
inputXInQueue_.FreeTensor(inLocal);
|
||||
inputXOutQueue_.FreeTensor(outLocal);
|
||||
scaleOutQueue_.FreeTensor(scaleLocal);
|
||||
}
|
||||
|
||||
smoothInQueue_.FreeTensor(smoothLocal);
|
||||
expandRowIdxInQueue_.FreeTensor(indicesLocal);
|
||||
}
|
||||
|
||||
// First pass of the two-pass (column-tiled) quantization: loads one column tile
// of source row srcIdx, optionally applies the smooth vector, spills the smoothed
// float tile to the per-core workspace quantTempGm_ (re-read later by
// ComputeScale), and returns the tile's absolute maximum.
// Caller must set colsTileLength_ before each call; j is the column-tile index.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline float
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::ComputeMax(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                                     LocalTensor<float> &scaleLocal, int32_t srcIdx, int32_t expertIdx,
                                                     int64_t j)
{
    LocalTensor<float> smoothLocal = smoothInQueue_.AllocTensor<float>();

    DataCopyExtParams intriParamsT{1, static_cast<uint32_t>(colsTileLength_ * sizeof(T)), 0, 0, 0};
    DataCopyExtParams intriParamsFp32{1, static_cast<uint32_t>(colsTileLength_ * sizeof(float)), 0, 0, 0};

    // Narrow (non-float) input is staged at offset perLoopColsAlign_ and widened
    // to float below; float input lands at the start of the buffer directly.
    if constexpr (!IsSameType<T, float>::value) {
        DataCopyPad(inLocal.ReinterpretCast<T>()[perLoopColsAlign_], inputXGm_[srcIdx * cols_ + j * perLoopCols_],
                    intriParamsT, {false, 0, 0, 0});
    } else {
        DataCopyPad(inLocal, inputXGm_[srcIdx * cols_ + j * perLoopCols_], intriParamsT, {false, 0, 0, 0});
    }

    inputXInQueue_.EnQue<float>(inLocal);
    inLocal = inputXInQueue_.DeQue<float>();

    if (isInputScale_) {
        DataCopyPad(smoothLocal, quantSmoothGm_[expertIdx * cols_ + j * perLoopCols_], intriParamsFp32,
                    {false, 0, 0, 0});
        smoothInQueue_.EnQue(smoothLocal);
        smoothLocal = smoothInQueue_.DeQue<float>();
    }

    if constexpr (!IsSameType<T, float>::value) {
        Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign_], RoundMode::CAST_NONE, colsTileLength_);
        PipeBarrier<PIPE_V>();
    }

    if (isInputScale_) {
        Mul(inLocal, inLocal, smoothLocal, colsTileLength_);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, colsTileLength_);
    PipeBarrier<PIPE_V>();

    // Write the tile max at offset INT32_ONE_BLOCK_NUM so it does not clobber the
    // per-row scale kept at element 0 of scaleLocal.
    ReduceMax(scaleLocal[INT32_ONE_BLOCK_NUM], tempLocal, tempLocal, colsTileLength_);

    // Spill the (smoothed) float tile for the second pass.
    DataCopyPad(quantTempGm_[j * perLoopCols_], inLocal, intriParamsFp32);
    smoothInQueue_.FreeTensor(smoothLocal);
    // Ensure the MTE3 spill completes before the next tile's MTE2 load reuses UB.
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    return scaleLocal.GetValue(INT32_ONE_BLOCK_NUM);
}
|
||||
|
||||
// Second pass of the two-pass quantization: re-loads the smoothed float tile j
// that ComputeMax() spilled to quantTempGm_, divides by the row-wide scale
// scaleTemp, casts to int8 (via half), and writes the tile into destination row
// dstIndex of expandedXGm_. colsTileLength_ must match the first pass.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::ComputeScale(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                                       float scaleTemp, int64_t dstIndex, int64_t j)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(int8_t)), 0, 0, 0};

    LocalTensor<int8_t> outLocal = inputXOutQueue_.AllocTensor<int8_t>();

    // Reload the pre-smoothed float tile from the workspace.
    DataCopyPad(inLocal, quantTempGm_[j * perLoopCols_], copyInParams, {false, 0, 0, 0});
    inputXInQueue_.EnQue<float>(inLocal);
    inLocal = inputXInQueue_.DeQue<float>();

    Duplicate<float>(tempLocal, scaleTemp, colsTileLength_);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, colsTileLength_);
    PipeBarrier<PIPE_V>();

    // float -> half -> int8 cast chain.
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal, RoundMode::CAST_TRUNC, colsTileLength_);
    PipeBarrier<PIPE_V>();

    Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_ROUND, colsTileLength_);

    inputXOutQueue_.EnQue(outLocal);
    outLocal = inputXOutQueue_.DeQue<int8_t>();
    DataCopyPad(expandedXGm_[dstIndex * cols_ + j * perLoopCols_], outLocal, copyOutParams);

    inputXOutQueue_.FreeTensor(outLocal);
    // Ensure the copy-out finishes before the next tile's load reuses the buffers.
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
}
|
||||
|
||||
// Scatter-style dynamic-quant copy-out for rows too wide to fit in UB in one
// piece: two passes over the column tiles. Pass 1 (ComputeMax) finds the
// row-wide absolute maximum while spilling smoothed tiles to the workspace;
// pass 2 (ComputeScale) quantizes each tile with the resulting scale.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXPartialDynamicQuantFromScatter(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
    for (int64_t i = 0; i < currentLoopRows_; i++) {
        int64_t rowOffset = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        // Dropless mode: destination offsets are monotonic, so stop once the
        // active-row budget is exhausted.
        if (dropPadMode_ == DROPLESS_MODE && (rowOffset + i) >= activeNum_) {
            break;
        }
        int32_t srcIdx = indicesLocal.GetValue(i);
        // Sorted expert ids live in the second half of indicesLocal.
        int32_t expertIdx = indicesLocal.GetValue(currentLoopRowsAlign_ + i) - expertStart_;
        LocalTensor<float> inLocal = inputXInQueue_.AllocTensor<float>();
        LocalTensor<float> tempLocal = calcQueue_.AllocTensor<float>();
        LocalTensor<float> scaleLocal = scaleOutQueue_.AllocTensor<float>();

        float tileMax;
        // Seed for the running maximum: INF's bit pattern reinterpreted as float
        // (declared elsewhere; presumably encodes -inf — confirm at its definition).
        float reduceMax = *((float *)&INF);
        for (int64_t j = 0; j < colLoops_; j++) {
            colsTileLength_ = perLoopCols_;
            if (j == colLoops_ - 1) {
                colsTileLength_ = lastLoopCols_;
            }

            if (smoothType_ == SCALE_1H) {
                // 1H: single shared smooth vector -> expert slot 0.
                tileMax = ComputeMax(inLocal, tempLocal, scaleLocal, srcIdx / k_, 0, j);
            } else {
                // EH: per-expert smooth vector.
                tileMax = ComputeMax(inLocal, tempLocal, scaleLocal, srcIdx / k_, expertIdx, j);
            }
            reduceMax = (reduceMax > tileMax) ? reduceMax : tileMax;
        }

        // Row-wide dynamic scale, written out once per destination row.
        float scaleTemp = reduceMax / MAX_INT8;
        Duplicate<float>(scaleLocal, scaleTemp, INT32_ONE_BLOCK_NUM);
        scaleOutQueue_.EnQue(scaleLocal);
        scaleLocal = scaleOutQueue_.DeQue<float>();

        DataCopyPad(expandedScaleGm_[(rowOffset + i)], scaleLocal, {1, 4, 0, 0, 0});

        // Pass 2: quantize every tile with the row-wide scale.
        for (int64_t j = 0; j < colLoops_; j++) {
            colsTileLength_ = perLoopCols_;
            if (j == colLoops_ - 1) {
                colsTileLength_ = lastLoopCols_;
            }
            ComputeScale(inLocal, tempLocal, scaleTemp, rowOffset + i, j);
        }
        inputXInQueue_.FreeTensor(inLocal);
        calcQueue_.FreeTensor(tempLocal);
        scaleOutQueue_.FreeTensor(scaleLocal);
    }
    expandRowIdxInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
// Gather-style dynamic-quant copy-out for rows too wide to fit in UB in one
// piece: per distinct source row, pass 1 (ComputeMax) finds the row maximum
// across column tiles, then for every destination position mapped to the row
// (duplicates from top-k) pass 2 (ComputeScale) quantizes tile by tile.
// Note: ComputeMax is always called with expert slot 0 here; the 1H/EH split is
// handled by isInputScale_ inside ComputeMax (gm offset 0 when expertIdx == 0).
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::CopyOutXPartialDynamicQuantFromGather(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxInQueue_.DeQue<int32_t>();
    int64_t rowOffset = blockIdx_ * perCoreRow_ + progress * perLoopRows_;
    // Each source row appears k_ times in sorted order; derive the covered range.
    int64_t startXRow = rowOffset / k_;
    int64_t endXRow = (rowOffset + currentLoopRows_ - 1) / k_;
    int64_t curIndex = 0;

    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    for (int64_t row = startXRow; row <= endXRow; row++) {
        LocalTensor<float> inLocal = inputXInQueue_.AllocTensor<float>();
        LocalTensor<float> tempLocal = calcQueue_.AllocTensor<float>();
        LocalTensor<float> quantScaleLocal = scaleOutQueue_.AllocTensor<float>();

        // Seed for the running maximum: INF's bit pattern reinterpreted as float
        // (declared elsewhere; presumably encodes -inf — confirm at its definition).
        float reduceMax = *((float *)&INF);
        for (int64_t j = 0; j < colLoops_; j++) {
            colsTileLength_ = perLoopCols_;
            if (j == colLoops_ - 1) {
                colsTileLength_ = lastLoopCols_;
            }

            float tileMax = ComputeMax(inLocal, tempLocal, quantScaleLocal, row, 0, j);
            reduceMax = (reduceMax > tileMax) ? reduceMax : tileMax;
        }

        // Row-wide dynamic scale.
        float scaleTemp = reduceMax / MAX_INT8;
        Duplicate<float>(quantScaleLocal, scaleTemp, INT32_ONE_BLOCK_NUM);
        scaleOutQueue_.EnQue(quantScaleLocal);
        quantScaleLocal = scaleOutQueue_.DeQue<float>();

        // Fan the quantized row out to every destination mapped to this source row.
        while (curIndex < currentLoopRows_ && (curIndex + rowOffset) / k_ == row) {
            int32_t outIndex = indicesLocal.GetValue(curIndex);
            curIndex++;
            // -1 marks a dropped row; dropless mode also drops rows past activeNum_.
            if (outIndex == -1 || (dropPadMode_ == DROPLESS_MODE && outIndex >= activeNum_)) {
                continue;
            }
            DataCopyPad(expandedScaleGm_[outIndex], quantScaleLocal, quantScaleParams);
            for (int64_t j = 0; j < colLoops_; j++) {
                colsTileLength_ = perLoopCols_;
                if (j == colLoops_ - 1) {
                    colsTileLength_ = lastLoopCols_;
                }
                ComputeScale(inLocal, tempLocal, scaleTemp, outIndex, j);
            }
        }
        inputXInQueue_.FreeTensor(inLocal);
        calcQueue_.FreeTensor(tempLocal);
        scaleOutQueue_.FreeTensor(quantScaleLocal);
    }
    expandRowIdxInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
// Initializes tiling-derived state, global-memory views and UB queues for the
// dynamic-quant gather/scatter copy-out stage.
// The sortedExpertIdx buffer doubles as a workspace; the offsets used below
// imply this int32-slot layout (inferred from usage — confirm against the
// tiling/host code): sorted expert ids [0, align(n*k)), row-index table
// [align(n*k), 2*align(n*k)), then align(actualExpertNum) of per-expert data,
// the total routed-row counter, and (when colLoops_ > 1) a float spill area of
// cols_ per core for the two-pass quantization.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void
MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::Init(GM_ADDR inputX, GM_ADDR quantSmooth, GM_ADDR sortedExpertIdx,
                                               GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR expandedScale,
                                               const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    totalLength_ = n_ * k_;
    isInputScale_ = tilingData->isInputScale;
    expertStart_ = tilingData->expertStart;
    rowIdxType_ = tilingData->rowIdxType;
    dropPadMode_ = tilingData->dropPadMode;
    activeNum_ = tilingData->activeNum;
    ep_ = tilingData->ep;
    smoothType_ = tilingData->smoothType;
    coreNum_ = tilingData->coreNum;

    // core split
    int64_t actualExpertNum_ = tilingData->actualExpertNum;
    if (ep_) {
        // Expert-parallel: the actual number of routed rows was produced by an
        // earlier stage and is read back from the workspace.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + Align(n_ * k_, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            1);
        // Invalidate the cache line so the value written by another core is seen.
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = totalLength_;
    }

    // Split the routed rows evenly across cores; the last core takes the remainder.
    perCoreRow_ = Ceil(expertTotalCount_, tilingData->coreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreRow_);
    int64_t lastCoreIndicesElements = expertTotalCount_ - (needCoreNum_ - 1) * perCoreRow_;

    // inner core split
    int64_t originPerLoopElements;
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = lastCoreIndicesElements;
        originPerLoopElements = gatherOutTilingData_->lastCorePerLoopIndicesElements;
    } else {
        coreRows_ = perCoreRow_;
        originPerLoopElements = gatherOutTilingData_->perCorePerLoopIndicesElements;
    }
    perLoopRows_ = Min(coreRows_, originPerLoopElements);
    rowLoops_ = Ceil(coreRows_, perLoopRows_);
    lastLoopRows_ = coreRows_ - (rowLoops_ - 1) * perLoopRows_;

    // cols split
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    perLoopColsAlign_ = Align(perLoopCols_, sizeof(T));

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);

    expandedExpertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + blockIdx_ * perCoreRow_,
                                         Align(coreRows_, sizeof(int32_t)));

    // Pick the row-index table: either the caller-provided expandedRowIdx output
    // or the internally computed table inside the workspace, depending on which
    // direction (rowIdxType_) matches this copy-out's COPYOUTTYPE.
    if constexpr (COPYOUTTYPE == SCATTER) {
        if (rowIdxType_ == SCATTER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        }
    } else {
        if (rowIdxType_ == GATHER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)sortedExpertIdx + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(perCoreRow_, sizeof(int32_t)));
        }
    }

    if (isInputScale_) {
        quantSmoothGm_.SetGlobalBuffer((__gm__ float *)quantSmooth);
    }
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);

    // Two-pass path needs a per-core float spill area for smoothed tiles.
    if (colLoops_ > 1) {
        quantTempGm_.SetGlobalBuffer((__gm__ float *)sortedExpertIdx + Align(totalLength_, sizeof(int32_t)) * 2 +
                                         Align(actualExpertNum_, sizeof(int32_t)) * 2 +
                                         Align(totalLength_, sizeof(int32_t)) + blockIdx_ * cols_,
                                     cols_ * sizeof(float));
    }

    currentLoopRowsAlign_ = Align(perLoopRows_, sizeof(int32_t));

    // inputXInQueue_ must hold the float copy plus the narrow staging area; never
    // smaller than two 32-byte blocks.
    int64_t perLoopColsAlignBytes = AlignBytes(this->perLoopCols_, sizeof(T));
    perLoopColsAlignBytes =
        Max(int64_t(perLoopColsAlignBytes * sizeof(float) / sizeof(T)), int64_t(BLOCK_BYTES + BLOCK_BYTES));
    // 2x: row indices and expert ids are staged back-to-back in one buffer.
    pipe_->InitBuffer(expandRowIdxInQueue_, GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM,
                      2 * AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(inputXInQueue_, GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM, perLoopColsAlignBytes); // percols * 2 * 4
    pipe_->InitBuffer(smoothInQueue_, GATHER_OUT_DYNAMIC_QUANT_BUFFER_NUM,
                      AlignBytes(perLoopCols_, sizeof(float))); // percols * 2 * 4
    pipe_->InitBuffer(calcQueue_, 1, AlignBytes(perLoopCols_, sizeof(float))); // percols * 1 * 4
    pipe_->InitBuffer(inputXOutQueue_, 1, AlignBytes(perLoopCols_, sizeof(int8_t))); // percols * 1
    pipe_->InitBuffer(scaleOutQueue_, 1, BLOCK_BYTES + BLOCK_BYTES); // 32 + 32
}
|
||||
|
||||
// Main loop of the dynamic-quant copy-out stage. Only cores with work
// (blockIdx_ < needCoreNum_) participate. Per row loop: stage the sorted
// indices, then dispatch to the single-pass path (row fits in UB) or the
// two-pass "partial" path (colLoops_ > 1, column-tiled with workspace spill),
// each in its gather or scatter flavor selected at compile time by COPYOUTTYPE.
template <typename T, const int COPYOUTTYPE>
__aicore__ inline void MoeGatherOutDynamicQuant<T, COPYOUTTYPE>::Process()
{
    if (blockIdx_ < needCoreNum_) {
        currentLoopRows_ = perLoopRows_;
        if (colLoops_ > 1) {
            // Row too wide for UB: two-pass quantization over column tiles.
            for (int64_t loop = 0; loop < rowLoops_; loop++) {
                if (loop == rowLoops_ - 1) {
                    currentLoopRows_ = lastLoopRows_;
                }
                CopyInExpandedExpertIdx(loop);
                if constexpr (COPYOUTTYPE == GATHER) {
                    CopyOutXPartialDynamicQuantFromGather(loop);
                } else {
                    CopyOutXPartialDynamicQuantFromScatter(loop);
                }
            }
        } else {
            // Whole row fits in UB: single-pass quantization.
            for (int64_t loop = 0; loop < rowLoops_; loop++) {
                if (loop == rowLoops_ - 1) {
                    currentLoopRows_ = lastLoopRows_;
                }
                CopyInExpandedExpertIdx(loop);
                if constexpr (COPYOUTTYPE == GATHER) {
                    CopyOutXDynamicQuantFromGather(loop);
                } else {
                    CopyOutXDynamicQuantFromScatter(loop);
                }
            }
        }
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_DYNAMIC_QUANT_H
|
||||
321
csrc/moe_init_routing_custom/op_kernel/moe_custom_gather_out.h
Normal file
321
csrc/moe_init_routing_custom/op_kernel/moe_custom_gather_out.h
Normal file
@@ -0,0 +1,321 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_out.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_OUT_H
|
||||
#define MOE_CUSTOM_GATHER_OUT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t GATHER_OUT_BUFFER_NUM = 2;
|
||||
|
||||
// Plain (non-quantizing) gather/scatter copy-out stage of MoeInitRoutingCustom:
// moves input rows (and, optionally, one float scale per row) to their expanded
// positions according to the expanded row indices. The compile-time EP flag
// selects expert-parallel mode, in which the number of valid routed rows is
// read back from the workspace instead of being n*k.
template <typename T, const int EP>
class MoeGatherOut {
public:
    __aicore__ inline MoeGatherOut(){};
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR scale, GM_ADDR workspace, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR expandedScale, const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();
    // Stage one loop's slice of expanded row indices into UB.
    __aicore__ inline void CopyExpertIn(int64_t progress);
    // Single-row GM -> UB / UB -> GM transfers for row data and per-row scales.
    __aicore__ inline void CopyXIn(int64_t xSrcOffset, int64_t curLoopCols);
    __aicore__ inline void CopyXOut(int64_t xDstOffset, int64_t curLoopCols);
    __aicore__ inline void CopyScaleIn(int64_t scaleSrcOffset);
    __aicore__ inline void CopyScaleOut(int64_t scaleDstOffset);
    // Copy-out flavors: gather iterates source rows and fans out to destinations;
    // scatter iterates sorted positions and pulls from source rows.
    __aicore__ inline void GatherCopyOut(int64_t progress);
    __aicore__ inline void ScatterCopyOut(int64_t progress);

private:
    TPipe *pipe_;
    // Double-buffered staging queues (GATHER_OUT_BUFFER_NUM = 2).
    TQueBind<TPosition::VECIN, TPosition::VECOUT, GATHER_OUT_BUFFER_NUM> xCopyInQueue_;
    TQueBind<TPosition::VECIN, TPosition::VECOUT, GATHER_OUT_BUFFER_NUM> scaleCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_BUFFER_NUM> expandedRowIdxCopyInQueue_;

    // Global-memory views (set up in Init()).
    GlobalTensor<T> xGm_;
    GlobalTensor<float> xGscaleGm_;
    GlobalTensor<int32_t> sortedExpertIdxGm_;
    GlobalTensor<T> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> expandedScaleGm_;
    GlobalTensor<int32_t> expertTotalCountGm_;

    // Problem shape and mode flags from tiling.
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t activeNum_;
    int64_t dropPadMode_;

    // Column tiling.
    int64_t colsLoops_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;

    // Index (row) tiling for the current core.
    int64_t indicesLoops_;
    int64_t curLoopElements_;

    int64_t perCoreIndicesElements_;
    int64_t lastCoreIndicesElements_;
    int64_t perCorePerLoopIndicesElements_;
    int64_t lastCorePerLoopIndicesElements_;
    int64_t curCorePerLoopIndicesElements_;
    int64_t curCoreLastLoopIndicesElements_;
    int64_t needCoreNum_;
    int64_t curCoreIndicesElements_;

    int64_t actualExpertNum_;
    int64_t expertTotalCount_;

    int64_t rowIdxType_;
    int64_t isInputScale_;
    int64_t coreNum_;
};
|
||||
|
||||
// Initializes tiling-derived state, global-memory views and UB queues for the
// plain gather/scatter copy-out. The workspace layout mirrors the dynamic-quant
// stage: sorted expert ids, the computed row-index table at align(n*k), and
// (in EP mode) the total routed-row counter after 2*align(n*k) +
// align(actualExpertNum) int32 slots (layout inferred from offsets — confirm
// against the tiling/host code).
template <typename T, const int EP>
__aicore__ inline void MoeGatherOut<T, EP>::Init(GM_ADDR x, GM_ADDR scale, GM_ADDR workspace, GM_ADDR expandedRowIdx,
                                                 GM_ADDR expandedX, GM_ADDR expandedScale,
                                                 const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();

    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    coreNum_ = tilingData->coreNum;
    dropPadMode_ = tilingData->dropPadMode;
    activeNum_ = tilingData->activeNum;

    isInputScale_ = tilingData->isInputScale;
    rowIdxType_ = tilingData->rowIdxType;

    colsLoops_ = tilingData->gatherOutComputeParamsOp.colsLoops;
    perLoopCols_ = tilingData->gatherOutComputeParamsOp.perLoopCols;
    lastLoopCols_ = tilingData->gatherOutComputeParamsOp.lastLoopCols;

    actualExpertNum_ = tilingData->actualExpertNum;

    if constexpr (EP) {
        // Expert-parallel: the routed-row count is produced by an earlier stage.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            1);
        // Invalidate the cache line so the value written by another core is seen.
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = n_ * k_;
    }

    // Split the routed rows across cores; the last core takes the remainder.
    perCorePerLoopIndicesElements_ = tilingData->gatherOutComputeParamsOp.perCorePerLoopIndicesElements;
    lastCorePerLoopIndicesElements_ = tilingData->gatherOutComputeParamsOp.lastCorePerLoopIndicesElements;
    perCoreIndicesElements_ = Ceil(expertTotalCount_, tilingData->coreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreIndicesElements_);
    lastCoreIndicesElements_ = expertTotalCount_ - (needCoreNum_ - 1) * perCoreIndicesElements_;

    if (blockIdx_ == needCoreNum_ - 1) {
        curCoreIndicesElements_ = lastCoreIndicesElements_;
        curCorePerLoopIndicesElements_ = Min(lastCorePerLoopIndicesElements_, curCoreIndicesElements_);
    } else {
        curCoreIndicesElements_ = perCoreIndicesElements_;
        curCorePerLoopIndicesElements_ = Min(perCorePerLoopIndicesElements_, curCoreIndicesElements_);
    }
    indicesLoops_ = Ceil(curCoreIndicesElements_, curCorePerLoopIndicesElements_);
    curCoreLastLoopIndicesElements_ = curCoreIndicesElements_ - (indicesLoops_ - 1) * curCorePerLoopIndicesElements_;

    xGm_.SetGlobalBuffer((__gm__ T *)x, n_ * cols_);
    xGscaleGm_.SetGlobalBuffer((__gm__ float *)scale, n_);

    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX);
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);

    pipe_->InitBuffer(expandedRowIdxCopyInQueue_, GATHER_OUT_BUFFER_NUM,
                      AlignBytes(curCorePerLoopIndicesElements_, sizeof(int32_t)));
    pipe_->InitBuffer(xCopyInQueue_, GATHER_OUT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(scaleCopyInQueue_, GATHER_OUT_BUFFER_NUM, AlignBytes(1, sizeof(float)));

    sortedExpertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + blockIdx_ * perCoreIndicesElements_,
                                       Align(curCoreIndicesElements_, sizeof(int32_t)));

    // Pick the row-index table: the caller-provided expandedRowIdx output or the
    // internally computed table in the workspace, depending on rowIdxType_.
    if constexpr (EP) {
        if (rowIdxType_ == SCATTER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        }
    } else {
        if (rowIdxType_ == GATHER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreIndicesElements_,
                                              Align(curCoreIndicesElements_, sizeof(int32_t)));
        }
    }
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyExpertIn(int64_t progress)
|
||||
{
|
||||
LocalTensor<int32_t> subRowIdxLocal = expandedRowIdxCopyInQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams copyParams{1, static_cast<uint32_t>(curLoopElements_ * sizeof(int32_t)), 0, 0, 0};
|
||||
DataCopyPadExtParams<int32_t> padParams{false, 0, 0, 0};
|
||||
DataCopyPad(subRowIdxLocal, expandedRowIdxGm_[progress * curCorePerLoopIndicesElements_], copyParams, padParams);
|
||||
expandedRowIdxCopyInQueue_.EnQue(subRowIdxLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyXIn(int64_t xSrcOffset, int64_t curLoopCols)
|
||||
{
|
||||
LocalTensor<T> xLocal = xCopyInQueue_.AllocTensor<T>();
|
||||
DataCopyExtParams copyParams0{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
|
||||
DataCopyPadExtParams<T> padParams0{false, 0, 0, 0};
|
||||
DataCopyPad(xLocal, xGm_[xSrcOffset], copyParams0, padParams0);
|
||||
xCopyInQueue_.EnQue(xLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyXOut(int64_t xDstOffset, int64_t curLoopCols)
|
||||
{
|
||||
LocalTensor<T> xLocal = xCopyInQueue_.DeQue<T>();
|
||||
DataCopyExtParams copyParams2{1, static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
|
||||
DataCopyPad(expandedXGm_[xDstOffset], xLocal, copyParams2);
|
||||
xCopyInQueue_.FreeTensor(xLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyScaleIn(int64_t scaleSrcOffset)
|
||||
{
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
|
||||
DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(1 * sizeof(float)), 0, 0, 0};
|
||||
DataCopyPadExtParams<float> padParams1{false, 0, 0, 0};
|
||||
DataCopyPad(scaleLocal, xGscaleGm_[scaleSrcOffset], copyParams1, padParams1);
|
||||
scaleCopyInQueue_.EnQue(scaleLocal);
|
||||
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::CopyScaleOut(int64_t scaleDstOffset)
|
||||
{
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.DeQue<float>();
|
||||
DataCopyExtParams copyParams3{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
|
||||
DataCopyPad(expandedScaleGm_[scaleDstOffset], scaleLocal, copyParams3);
|
||||
scaleCopyInQueue_.FreeTensor(scaleLocal);
|
||||
}
|
||||
|
||||
// Gather-style copy-out: for each column tile and each distinct source row
// covered by this loop's index slice, load the row tile (and, on the first
// column tile, its scale) once, then fan it out to every destination position
// in subRowIdxLocal that maps to that source row (duplicates from top-k).
template <typename T, const int EP>
__aicore__ inline void MoeGatherOut<T, EP>::GatherCopyOut(int64_t progress)
{
    LocalTensor<int32_t> subRowIdxLocal = expandedRowIdxCopyInQueue_.DeQue<int32_t>();
    // Scalar GetValue reads below must not race the MTE2 index load.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    int64_t curLoopCols = perLoopCols_;
    for (int64_t colsLoop = 0; colsLoop < colsLoops_; colsLoop++) {
        int64_t initialRow = blockIdx_ * perCoreIndicesElements_ + curCorePerLoopIndicesElements_ * progress;
        int64_t curLoopRow = 0;
        if (colsLoop == colsLoops_ - 1) {
            curLoopCols = lastLoopCols_;
        }
        // Each source row appears k_ times in sorted order; derive the row range.
        int64_t currentLoopStartRow = initialRow / k_;
        int64_t currentLoopLastRow = (initialRow + this->curLoopElements_ - 1) / k_;
        for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
            LocalTensor<T> inLocal = xCopyInQueue_.AllocTensor<T>();
            int64_t inputOffset = row * cols_ + colsLoop * perLoopCols_;
            DataCopyExtParams xCopyParams{1, static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
            DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
            DataCopyPad(inLocal, xGm_[inputOffset], xCopyParams, dataCopyPadParams);
            // copy in scale (only once per row, on the first column tile)
            LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
            DataCopyExtParams scaleCopyParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
            if (isInputScale_ == 1 && colsLoop == 0) {
                DataCopyPadExtParams<float> scalePadParams{false, 0, 0, 0};
                DataCopyPad(scaleLocal, xGscaleGm_[row], scaleCopyParams, scalePadParams);
            }
            // The copy-out below re-reads the buffers just loaded; order MTE2 -> MTE3.
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            DataCopyExtParams intriParams{1, static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
            // Fan out to every destination mapped to this source row.
            while (curLoopRow < this->curLoopElements_ && initialRow / k_ == row) {
                int32_t outIndex = subRowIdxLocal.GetValue(curLoopRow);
                curLoopRow++;
                initialRow++;
                // -1 marks a dropped row; dropless mode also drops rows past activeNum_.
                if (outIndex == -1 || (dropPadMode_ == DROPLESS_MODE && outIndex >= activeNum_)) {
                    continue;
                }
                int64_t outOffset = outIndex * this->cols_ + colsLoop * this->perLoopCols_;
                DataCopyPad(expandedXGm_[outOffset], inLocal, intriParams);
                // copy out scale
                if (isInputScale_ == 1 && colsLoop == 0) {
                    DataCopyPad(expandedScaleGm_[outIndex], scaleLocal, scaleCopyParams);
                }
            }
            scaleCopyInQueue_.FreeTensor(scaleLocal);
            xCopyInQueue_.FreeTensor(inLocal);
        }
    }
    expandedRowIdxCopyInQueue_.FreeTensor(subRowIdxLocal);
}
|
||||
|
||||
template <typename T, const int EP>
__aicore__ inline void MoeGatherOut<T, EP>::ScatterCopyOut(int64_t progress)
{
    // EP (expert-parallel) path: destination rows are contiguous per core; each
    // loaded index selects which SOURCE row (rowIdx / k_) feeds the destination.
    int64_t curExpertLoopOffset = progress * curCorePerLoopIndicesElements_;
    LocalTensor<int32_t> subRowIdxLocal = expandedRowIdxCopyInQueue_.DeQue<int32_t>();
    for (int64_t indicesIndex = 0; indicesIndex < curLoopElements_; indicesIndex++) {
        int64_t rowIdx = subRowIdxLocal.GetValue(indicesIndex);
        // Flat destination row handled by this iteration.
        int64_t rowOffset = curExpertLoopOffset + indicesIndex + blockIdx_ * perCoreIndicesElements_;
        // Dropless mode caps output at activeNum_ rows; later rows are not produced.
        if (activeNum_ > 0 && dropPadMode_ == DROPLESS_MODE && rowOffset >= activeNum_) {
            break;
        }
        // Scalar GetValue above must complete before the next MTE2 copy-in starts.
        SetWaitFlag<HardEvent::S_MTE2>(HardEvent::S_MTE2);
        if (isInputScale_ == 1) {
            // One scale per source row; copied through the scale queue.
            int64_t scaleSrcOffset = rowIdx / k_;
            CopyScaleIn(scaleSrcOffset);
            CopyScaleOut(indicesIndex + curExpertLoopOffset + blockIdx_ * perCoreIndicesElements_);
        }
        int64_t curLoopCols = perLoopCols_;
        for (int64_t colsLoop = 0; colsLoop < colsLoops_; colsLoop++) {
            if (colsLoop == colsLoops_ - 1) {
                curLoopCols = lastLoopCols_;  // tail column tile
            }
            int64_t xSrcOffset = rowIdx / k_ * cols_;
            int64_t xDstOffset = (blockIdx_ * perCoreIndicesElements_ + curExpertLoopOffset + indicesIndex) * cols_;
            int64_t colsLoopOffset = colsLoop * perLoopCols_;
            // Tile-by-tile GM -> UB -> GM relay of the selected source row.
            CopyXIn(xSrcOffset + colsLoopOffset, curLoopCols);
            CopyXOut(xDstOffset + colsLoopOffset, curLoopCols);
        }
    }
    expandedRowIdxCopyInQueue_.FreeTensor(subRowIdxLocal);
}
|
||||
|
||||
template <typename T, const int EP>
|
||||
__aicore__ inline void MoeGatherOut<T, EP>::Process()
|
||||
{
|
||||
if (blockIdx_ < needCoreNum_) {
|
||||
curLoopElements_ = curCorePerLoopIndicesElements_;
|
||||
for (int64_t loop = 0; loop < indicesLoops_; loop++) {
|
||||
if (loop == indicesLoops_ - 1) {
|
||||
curLoopElements_ = curCoreLastLoopIndicesElements_;
|
||||
}
|
||||
CopyExpertIn(loop);
|
||||
if constexpr (!EP) {
|
||||
GatherCopyOut(loop);
|
||||
} else {
|
||||
ScatterCopyOut(loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_OUT_H
|
||||
@@ -0,0 +1,210 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_out_droppad.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_OUT_DROPPAD_H
|
||||
#define MOE_CUSTOM_GATHER_OUT_DROPPAD_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t GATHER_OUT_DROPPAD_BUFFER_NUM = 2;
|
||||
|
||||
// Gather/expand stage of MoeInitRoutingCustom in drop-pad mode: copies input
// rows (and optional per-row scales) into expandedX following expandedRowIdx,
// skipping indices marked -1 (dropped tokens).
template <typename T>
class MoeGatherOutDroppad {
public:
    __aicore__ inline MoeGatherOutDroppad(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR expandedScale, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Load one slice of expandedRowIdx for loop iteration `progress`.
    __aicore__ inline void CopyInIndices(int64_t progress);
    // Relay x rows (and scales) GM -> UB -> GM for loop iteration `progress`.
    __aicore__ inline void CopyOut(int64_t progress);
    // Stage one scale value into `scaleLocal` and enqueue it.
    __aicore__ inline void CopyScaleIn(int64_t scaleSrcOffset, LocalTensor<float> scaleLocal);
    // Write one scale value from `scaleLocal` to expandedScale.
    __aicore__ inline void CopyScaleOut(int64_t scaleDstOffset, LocalTensor<float> scaleLocal);

private:
    TPipe *pipe_;
    // Double-buffered relay queues (see GATHER_OUT_DROPPAD_BUFFER_NUM).
    TQueBind<QuePosition::VECIN, QuePosition::VECOUT, GATHER_OUT_DROPPAD_BUFFER_NUM> xCopyInQueue_;
    TQueBind<TPosition::VECIN, TPosition::VECOUT, GATHER_OUT_DROPPAD_BUFFER_NUM> scaleCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_DROPPAD_BUFFER_NUM> expandedRowIdxCopyInQueue_;

    // Global-memory views bound in Init().
    GlobalTensor<T> inputXGm_;
    GlobalTensor<float> xGscaleGm_;
    GlobalTensor<T> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> expandedScaleGm_;

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Core/loop split parameters (rows = routing indices handled per core).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t currentLoopRows_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    // Column tiling (cols split into colLoops_ tiles of perLoopCols_).
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    // 1 when a per-row scale input is present.
    int64_t isInputScale_;

    // Scratch offsets reused across copy helpers.
    int64_t indicesOffset_;
    int64_t inputOffset_;
    int64_t outOffset_;
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyInIndices(int64_t progress)
|
||||
{
|
||||
indicesOffset_ = progress * perLoopRows_;
|
||||
LocalTensor<int32_t> indicesLocal = expandedRowIdxCopyInQueue_.AllocTensor<int32_t>();
|
||||
DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
|
||||
DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
|
||||
DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams);
|
||||
expandedRowIdxCopyInQueue_.EnQue<int32_t>(indicesLocal);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyScaleIn(int64_t scaleSrcOffset, LocalTensor<float> scaleLocal)
|
||||
{
|
||||
DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(1 * sizeof(float)), 0, 0, 0};
|
||||
DataCopyPadExtParams<float> padParams1{false, 0, 0, 0};
|
||||
DataCopyPad(scaleLocal, xGscaleGm_[scaleSrcOffset], copyParams1, padParams1);
|
||||
scaleCopyInQueue_.EnQue(scaleLocal);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyScaleOut(int64_t scaleDstOffset, LocalTensor<float> scaleLocal)
|
||||
{
|
||||
DataCopyExtParams copyParams3{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
|
||||
DataCopyPad(expandedScaleGm_[scaleDstOffset], scaleLocal, copyParams3);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::CopyOut(int64_t progress)
|
||||
{
|
||||
LocalTensor<int32_t> indicesLocal = expandedRowIdxCopyInQueue_.DeQue<int32_t>();
|
||||
SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
|
||||
colsTileLength_ = perLoopCols_;
|
||||
for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
|
||||
int64_t initialRow = gatherOutTilingData_->perCoreIndicesElements * blockIdx_ + perLoopRows_ * progress;
|
||||
int64_t curLoopRow = 0;
|
||||
if (colsLoop == colLoops_ - 1) {
|
||||
colsTileLength_ = lastLoopCols_;
|
||||
}
|
||||
int64_t currentLoopStartRow = initialRow / k_;
|
||||
int64_t currentLoopLastRow = (initialRow + currentLoopRows_ - 1) / k_;
|
||||
for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.AllocTensor<float>();
|
||||
if (isInputScale_ == 1) {
|
||||
CopyScaleIn(row, scaleLocal);
|
||||
LocalTensor<float> scaleLocal = scaleCopyInQueue_.DeQue<float>();
|
||||
}
|
||||
inputOffset_ = row * cols_ + colsLoop * perLoopCols_;
|
||||
// input row position
|
||||
LocalTensor<T> inLocal = xCopyInQueue_.AllocTensor<T>();
|
||||
DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(T)), 0, 0, 0};
|
||||
DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
|
||||
DataCopyPad(inLocal, inputXGm_[inputOffset_], dataCopyParams, dataCopyPadParams);
|
||||
SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
|
||||
DataCopyExtParams intriParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(T)), 0, 0, 0};
|
||||
while (curLoopRow < currentLoopRows_ && initialRow / k_ == row) {
|
||||
int32_t outIndex = indicesLocal.GetValue(curLoopRow);
|
||||
curLoopRow++;
|
||||
initialRow++;
|
||||
if (outIndex == -1) {
|
||||
continue;
|
||||
}
|
||||
outOffset_ = outIndex * cols_ + colsLoop * perLoopCols_;
|
||||
DataCopyPad(expandedXGm_[outOffset_], inLocal, intriParams);
|
||||
if (isInputScale_ == 1) {
|
||||
CopyScaleOut(outIndex, scaleLocal);
|
||||
}
|
||||
}
|
||||
xCopyInQueue_.FreeTensor(inLocal);
|
||||
scaleCopyInQueue_.FreeTensor(scaleLocal);
|
||||
}
|
||||
}
|
||||
expandedRowIdxCopyInQueue_.FreeTensor(indicesLocal);
|
||||
}
|
||||
|
||||
template <typename T>
__aicore__ inline void MoeGatherOutDroppad<T>::Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR expandedRowIdx,
                                                    GM_ADDR expandedX, GM_ADDR expandedScale, GM_ADDR workspace,
                                                    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Bind GM addresses, unpack tiling parameters for this core, and size the
    // UB relay queues. Must run before Process().
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);

    needCoreNum_ = gatherOutTilingData_->needCoreNum;
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    isInputScale_ = tilingData->isInputScale;

    // Per-core row split: the last active core gets the tail partition.
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = gatherOutTilingData_->lastCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->lastCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->lastCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->lastCoreIndicesLoops;
    } else {
        coreRows_ = gatherOutTilingData_->perCoreIndicesElements;
        perLoopRows_ = gatherOutTilingData_->perCorePerLoopIndicesElements;
        lastLoopRows_ = gatherOutTilingData_->perCoreLastLoopIndicesElements;
        rowLoops_ = gatherOutTilingData_->perCoreIndicesLoops;
    }
    // Column tiling parameters shared by all cores.
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX, coreRows_ * cols_);
    xGscaleGm_.SetGlobalBuffer((__gm__ float *)scale, n_);
    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX, n_ * k_ * cols_);
    // Row-index view starts at this core's partition of expandedRowIdx.
    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx +
                                          blockIdx_ * gatherOutTilingData_->perCoreIndicesElements,
                                      Align(coreRows_, sizeof(int32_t)));
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);

    // UB queues: one column tile of x, one loop's worth of indices, one scale.
    pipe_->InitBuffer(xCopyInQueue_, GATHER_OUT_DROPPAD_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(expandedRowIdxCopyInQueue_, GATHER_OUT_DROPPAD_BUFFER_NUM,
                      AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(scaleCopyInQueue_, GATHER_OUT_DROPPAD_BUFFER_NUM, AlignBytes(1, sizeof(float)));
}
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void MoeGatherOutDroppad<T>::Process()
|
||||
{
|
||||
if (blockIdx_ < needCoreNum_) {
|
||||
currentLoopRows_ = perLoopRows_;
|
||||
for (int64_t loop = 0; loop < rowLoops_; loop++) {
|
||||
if (loop == rowLoops_ - 1) {
|
||||
currentLoopRows_ = lastLoopRows_;
|
||||
}
|
||||
CopyInIndices(loop);
|
||||
CopyOut(loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_OUT_DROPPAD_H
|
||||
@@ -0,0 +1,242 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_sort_multi_core.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_SORT_MULTI_CORE_H
|
||||
#define MOE_CUSTOM_GATHER_SORT_MULTI_CORE_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t SORT32_ALIGN_ELEMENT = 32;
|
||||
constexpr int64_t PARALLEL_GATHERED_SORT_NEED_CORE_NUM = 16;
|
||||
constexpr int64_t MULTI_GATHERED_MAX_NUM = 4096; // 8192 * 8 / 16
|
||||
|
||||
// Multi-core partial sort of expert indices: each of the
// PARALLEL_GATHERED_SORT_NEED_CORE_NUM cores filters its slice of expertIdx to
// the local expert window [expertStart_, expertEnd_), sorts the survivors as
// (key, row-index) pairs, and writes the sorted run plus its length to the
// workspace for a later merge step.
class MoeGatherSortMultiCore {
public:
    __aicore__ inline MoeGatherSortMultiCore(){};
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Load this core's slice of expertIdx into the shared UB buffer.
    __aicore__ inline void CopyIn();
    // Filter, gather, and bitonic-sort the in-window expert indices.
    __aicore__ inline void Compute();
    // Write sorted (key, index) pairs and the per-core count to workspace.
    __aicore__ inline void CopyOut();

private:
    TPipe *pipe_;
    // Single flat UB scratch buffer; regions addressed via the *Offset_ fields.
    TBuf<TPosition::VECCALC> buffer_;
    GlobalTensor<int32_t> workspaceGm_;
    GlobalTensor<int32_t> expendedRowIdxGm_;
    GlobalTensor<int32_t> expertIdxGm_;
    // Workspace views: sorted keys, sorted indices, and per-core counts.
    GlobalTensor<float> sortedExpertIdxGm_;
    GlobalTensor<int32_t> sortedExpertIndexGm_;
    GlobalTensor<int32_t> sortedNumGm_;

    TQue<QuePosition::VECOUT, 1> sortedNumCopyOutQueue_;

    // Byte offsets of the regions inside buffer_ (computed in Init()).
    int64_t expertIdxOffset_ = 0;
    int64_t expertIndexOffset_ = 0;
    int64_t compareScalarMask0Offset_ = 0;
    int64_t compareScalarMask1Offset_ = 0;
    int64_t gatherMaskOffset_ = 0;

    int64_t totalLength_;
    // Local expert window [expertStart_, expertEnd_) handled by this rank.
    int64_t expertStart_ = 0;
    int64_t expertEnd_ = 0;
    // Number of indices surviving the window filter on this core.
    int64_t actual_expert_num_ = 0;
    int64_t needCoreNum_ = 0;
    int64_t perCoreElements_ = 0;
    int64_t blockIdx_;
    int64_t currentCoreElements_ = 0;
    // Survivor count rounded up to a full sort repeat (ONE_REPEAT_SORT_NUM).
    int64_t needSortNum_ = 0;
    // Sorted records are (key, value) pairs -> factor of 2 in sizing math.
    int64_t kvFactor = 2;

    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
    static constexpr int64_t MASK_STRIDE = 64;
};
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::CopyIn()
{
    // Load this core's slice of the expert-index array into its region of the
    // shared UB buffer, then fence MTE2 against the vector pipeline so
    // Compute() sees the data.
    LocalTensor<int32_t> expertIdxTensor = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    DataCopyPadExtParams padParams{false, 0, 0, 0};
    DataCopyExtParams copyParams{static_cast<uint16_t>(1),
                                 static_cast<uint32_t>(currentCoreElements_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPad(expertIdxTensor, expertIdxGm_[blockIdx_ * perCoreElements_], copyParams, padParams);
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::Compute()
{
    // Filter this core's expert indices to the local window [expertStart_,
    // expertEnd_), pair the survivors with their original row positions, and
    // sort them descending on the negated key (i.e. ascending expert id).
    LocalTensor<int32_t> expertIdx = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    LocalTensor<int32_t> gatheredExpertIdx = buffer_.Get<int32_t>();
    LocalTensor<float> gatheredExpertIdxFp32 = gatheredExpertIdx.ReinterpretCast<float>();

    // Convert to fp32 in place and negate, so a descending fp32 sort yields
    // ascending expert ids.
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, currentCoreElements_);
    PipeBarrier<PIPE_V>();
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, currentCoreElements_);
    PipeBarrier<PIPE_V>();

    LocalTensor<uint8_t> compareScalarMaskLocalTensor0 = buffer_.Get<uint8_t>()[compareScalarMask0Offset_];
    LocalTensor<uint8_t> compareScalarMaskLocalTensor1 = buffer_.Get<uint8_t>()[compareScalarMask1Offset_];
    LocalTensor<uint8_t> gatherMaskLocalTensor = buffer_.Get<uint8_t>()[gatherMaskOffset_];

    // Find elements >= expertStart_, which means -elements <= -expertStart_
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor0, expertIdxFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::LE,
        (currentCoreElements_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();

    // Find elements < expertEnd_, which means -elements > -expertEnd_
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor1, expertIdxFp32, static_cast<float>(-expertEnd_), AscendC::CMPMODE::GT,
        (currentCoreElements_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();

    // Get experts between [expert_start, expert_end)
    And(gatherMaskLocalTensor.ReinterpretCast<uint16_t>(), compareScalarMaskLocalTensor0.ReinterpretCast<uint16_t>(),
        compareScalarMaskLocalTensor1.ReinterpretCast<uint16_t>(),
        Ceil(currentCoreElements_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE / kvFactor);
    PipeBarrier<PIPE_V>();

    // Compact the in-window keys to the front of the buffer; sortedNum receives
    // the survivor count.
    uint64_t sortedNum = 0;
    GatherMaskParams gatherMaskParams;
    gatherMaskParams.repeatTimes = 1;
    gatherMaskParams.src0BlockStride = 1;
    gatherMaskParams.src0RepeatStride = DST_REP_STRIDE;
    gatherMaskParams.src1RepeatStride = DST_REP_STRIDE;
    GatherMask(gatheredExpertIdxFp32, expertIdxFp32, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(currentCoreElements_), gatherMaskParams, sortedNum);
    PipeBarrier<PIPE_V>();
    actual_expert_num_ = sortedNum;
    // Sort32 operates on full repeats; round the count up.
    int64_t needSortNum = Ceil(static_cast<int64_t>(sortedNum), ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    needSortNum_ = needSortNum;

    // Handle actual_expert_num_ == 0
    if (actual_expert_num_ < 1) {
        return;
    }

    // Build the value side: original flat positions of this core's elements,
    // compacted by the same mask.
    LocalTensor<int32_t> expertIndex = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<int32_t> gatheredExpertIndex = buffer_.Get<int32_t>()[needSortNum];
    ArithProgression<int32_t>(expertIndex, blockIdx_ * perCoreElements_, 1, currentCoreElements_);
    GatherMask(gatheredExpertIndex, expertIndex, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(currentCoreElements_), gatherMaskParams, sortedNum);
    PipeBarrier<PIPE_V>();
    // Pad the final partial repeat with MIN_FP32 so padding sorts to the end.
    int64_t duplicateNum = sortedNum % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = sortedNum - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(gatheredExpertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }
    PipeBarrier<PIPE_V>();

    // Interleave keys/values and sort; results land past the staging regions.
    LocalTensor<float> concatLocal;
    LocalTensor<float> sortTempTensor = buffer_.Get<float>()[needSortNum * kvFactor];
    Concat(concatLocal, gatheredExpertIdxFp32, sortTempTensor, needSortNum / ONE_REPEAT_SORT_NUM);
    LocalTensor<float> sortedLocal = buffer_.Get<float>()[needSortNum * kvFactor + needSortNum * kvFactor * kvFactor];
    Sort<float, true>(sortedLocal, concatLocal, gatheredExpertIndex.ReinterpretCast<uint32_t>(), sortTempTensor,
                      needSortNum / ONE_REPEAT_SORT_NUM);
    // CopyOut() reads sortedLocal via MTE3; fence vector -> MTE3.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::CopyOut()
{
    // Copy out sortedLocal for MergeSort: each core writes its sorted
    // (key, index) run to its fixed slot in the workspace. Slots are spaced by
    // 2 * perCoreElements_ (pairs), so runs never overlap across cores.
    if (actual_expert_num_ > 0) {
        LocalTensor<float> sortedLocal =
            buffer_.Get<float>()[needSortNum_ * kvFactor + needSortNum_ * kvFactor * kvFactor];
        DataCopyExtParams extParams{static_cast<uint16_t>(1),
                                    static_cast<uint32_t>(2 * actual_expert_num_ * sizeof(float)), 0, 0, 0};
        int64_t curCoreStartIndex = 2 * GetBlockIdx() * perCoreElements_;
        DataCopyPad(sortedExpertIdxGm_[curCoreStartIndex], sortedLocal, extParams);
    }

    // Copyout actual_expert_num_ — the per-core run length, written even when
    // it is zero so the merge step can read a complete count table.
    LocalTensor<int32_t> sortedNumOutLocal = sortedNumCopyOutQueue_.AllocTensor<int32_t>();
    sortedNumOutLocal.SetValue(0, actual_expert_num_);
    // Scalar SetValue must complete before the MTE3 copy reads the tensor.
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    DataCopyExtParams copyParams3{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(uint32_t)), 0, 0, 0};
    DataCopyPad(sortedNumGm_[GetBlockIdx()], sortedNumOutLocal, copyParams3);

    sortedNumCopyOutQueue_.FreeTensor(sortedNumOutLocal);
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                                    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Bind GM views, split totalLength_ across the fixed sort-core count, and
    // lay out the regions of the flat UB scratch buffer.
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();
    totalLength_ = tilingData->n * tilingData->k;

    // Local expert window handled by this rank.
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;

    expertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expertIdx);

    expendedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx);

    workspaceGm_.SetGlobalBuffer((__gm__ int32_t *)workspace);

    // Workspace layout: [sorted keys][sorted indices][...][per-core counts].
    sortedExpertIdxGm_.SetGlobalBuffer((__gm__ float *)workspace);
    sortedExpertIndexGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(totalLength_, sizeof(int32_t)));

    // key and value
    sortedNumGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                 Align(totalLength_, sizeof(int32_t)) * kvFactor * kvFactor);

    // Fixed core count for the parallel gather-sort stage; last core takes the tail.
    needCoreNum_ = PARALLEL_GATHERED_SORT_NEED_CORE_NUM;
    perCoreElements_ = Ceil(totalLength_, needCoreNum_);

    int32_t lastCoreElements = totalLength_ - (needCoreNum_ - 1) * perCoreElements_;
    if (blockIdx_ == (needCoreNum_ - 1)) {
        currentCoreElements_ = lastCoreElements;
    } else {
        currentCoreElements_ = perCoreElements_;
    }

    // expertIdxOffset_ — byte offsets of the regions inside buffer_:
    // [gather output][expert idx/index staging][gather mask][cmp mask 0][cmp mask 1]
    expertIdxOffset_ = AlignBytes(currentCoreElements_, sizeof(int32_t));
    expertIndexOffset_ = expertIdxOffset_;

    gatherMaskOffset_ = expertIdxOffset_ * kvFactor;
    int64_t maskOffset =
        AlignBytes(Ceil(currentCoreElements_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE, sizeof(int8_t));
    compareScalarMask0Offset_ = gatherMaskOffset_ + maskOffset;
    compareScalarMask1Offset_ = compareScalarMask0Offset_ + maskOffset;
    int64_t bufferSize = MULTI_GATHERED_MAX_NUM * kvFactor * kvFactor * kvFactor * sizeof(int32_t);
    pipe_->InitBuffer(sortedNumCopyOutQueue_, 1, AlignBytes(1, sizeof(int32_t)));
    pipe_->InitBuffer(buffer_, bufferSize); // 73728 Bytes
}
|
||||
|
||||
__aicore__ inline void MoeGatherSortMultiCore::Process()
{
    // Only the first PARALLEL_GATHERED_SORT_NEED_CORE_NUM cores participate in
    // the sort; every core still joins the barrier so the merge step can rely
    // on all partial runs being written.
    const bool coreActive = (blockIdx_ < PARALLEL_GATHERED_SORT_NEED_CORE_NUM);
    if (coreActive) {
        CopyIn();
        Compute();
        CopyOut();
    }
    SyncAll();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_SORT_MULTI_CORE_H
|
||||
@@ -0,0 +1,329 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_gather_quant.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_GATHER_STATIC_QUANT_H
|
||||
#define MOE_CUSTOM_GATHER_STATIC_QUANT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
constexpr int64_t GATHER_OUT_QUANT_BUFFER_NUM = 2;
|
||||
|
||||
// Gather/expand stage with static int8 quantization: rows of x are copied into
// expandedX as int8 using a single (scale, offset) pair read from GM. The EP
// template flag selects the expert-parallel (scatter-layout) variant.
template <typename T, const int EP>
class MoeGatherOutQuant {
public:
    __aicore__ inline MoeGatherOutQuant(){};
    __aicore__ inline void Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset, GM_ADDR expandedRowIdx,
                                GM_ADDR expandedX, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Load one slice of the row-index mapping for loop `progress`.
    __aicore__ inline void CopyExpertIn(int64_t progress);
    // Quantize the staged column tile (T -> half -> int8; see Compute body).
    __aicore__ inline void Compute(int64_t curLoopCols);
    // Stage one column tile of a source row into UB.
    __aicore__ inline void CopyXIn(int64_t xSrcOffset, int64_t curLoopCols);
    // Write one quantized int8 column tile to expandedX.
    __aicore__ inline void CopyXOut(int64_t xDstOffset, int64_t curLoopCols);
    // EP copy-out path (contiguous destinations).
    __aicore__ inline void ScatterCopyOut(int64_t progress);
    // Non-EP copy-out path (indexed destinations).
    __aicore__ inline void GatherCopyOut(int64_t progress);

private:
    TPipe *pipe_;
    // Double-buffered relay queues plus single-buffer quantization scratch.
    TQue<QuePosition::VECIN, GATHER_OUT_QUANT_BUFFER_NUM> inputXCopyInQueue_;
    TQue<QuePosition::VECIN, GATHER_OUT_QUANT_BUFFER_NUM> expandRowIdxCopyInQueue_;
    TQue<QuePosition::VECOUT, GATHER_OUT_QUANT_BUFFER_NUM> inputXCopyOutQueue_;
    TQue<QuePosition::VECOUT, 1> floatQueue_;
    TQue<QuePosition::VECOUT, 1> halfQueue_;

    // Global-memory views bound in Init().
    GlobalTensor<T> inputXGm_;
    GlobalTensor<int8_t> expandedXGm_;
    GlobalTensor<int32_t> expandedRowIdxGm_;
    GlobalTensor<float> scaleGm_;
    GlobalTensor<float> offsetGm_;
    // EP only: total expanded-row count produced by earlier stages (workspace).
    GlobalTensor<int32_t> expertTotalCountGm_;

    const MoeCustomGatherOutComputeTilingData *gatherOutTilingData_;

    // Core/row/column split (computed in Init() from expertTotalCount_).
    int64_t needCoreNum_;
    int64_t blockIdx_;
    int64_t cols_;
    int64_t n_;
    int64_t k_;
    int64_t perCoreRow_;
    int64_t currentLoopRows_;
    int64_t coreRows_;
    int64_t perLoopRows_;
    int64_t lastLoopRows_;
    int64_t rowLoops_;
    int64_t colsTileLength_;
    int64_t perLoopCols_;
    int64_t lastLoopCols_;
    int64_t colLoops_;
    // Static quantization parameters read once from GM in Init().
    float scale_;
    float offset_;
    // Layout of expandedRowIdx (GATHER vs SCATTER) and drop/pad behavior.
    int64_t rowIdxType_;
    int64_t dropPadMode_;
    int64_t activeNum_;
    int64_t indicesOffset_;
    int64_t coreNum_;
    int64_t inputOffset_;
    int64_t outOffset_;
    // Total rows to produce; n_ * k_ when not EP.
    int64_t expertTotalCount_;
};
|
||||
|
||||
template <typename T, const int EP>
__aicore__ inline void MoeGatherOutQuant<T, EP>::Init(GM_ADDR inputX, GM_ADDR scale, GM_ADDR offset,
                                                      GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR workspace,
                                                      const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    // Bind GM views, derive the dynamic core/row split from the actual expanded
    // row count, read the static quantization params, and size UB queues.
    pipe_ = tPipe;
    blockIdx_ = GetBlockIdx();

    gatherOutTilingData_ = &(tilingData->gatherOutComputeParamsOp);
    cols_ = tilingData->cols;
    n_ = tilingData->n;
    k_ = tilingData->k;
    rowIdxType_ = tilingData->rowIdxType;
    dropPadMode_ = tilingData->dropPadMode;
    activeNum_ = tilingData->activeNum;
    coreNum_ = tilingData->coreNum;

    // core split
    int64_t actualExpertNum_ = tilingData->actualExpertNum;

    if constexpr (EP) {
        // EP: the true row count was produced by an earlier kernel stage and
        // lives in the workspace; invalidate the cache line before reading it.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            1);
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = n_ * k_;
    }

    // Re-split rows across cores using the runtime row count.
    perCoreRow_ = Ceil(expertTotalCount_, tilingData->coreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreRow_);
    int64_t lastCoreIndicesElements_ = expertTotalCount_ - (needCoreNum_ - 1) * perCoreRow_;

    // inner core split
    int64_t originPerLoopElements;
    if (blockIdx_ == needCoreNum_ - 1) {
        coreRows_ = lastCoreIndicesElements_;
        originPerLoopElements = gatherOutTilingData_->lastCorePerLoopIndicesElements;
    } else {
        coreRows_ = perCoreRow_;
        originPerLoopElements = gatherOutTilingData_->perCorePerLoopIndicesElements;
    }
    perLoopRows_ = Min(coreRows_, originPerLoopElements);
    rowLoops_ = Ceil(coreRows_, perLoopRows_);
    lastLoopRows_ = coreRows_ - (rowLoops_ - 1) * perLoopRows_;

    // cols split
    perLoopCols_ = gatherOutTilingData_->perLoopCols;
    lastLoopCols_ = gatherOutTilingData_->lastLoopCols;
    colLoops_ = gatherOutTilingData_->colsLoops;

    inputXGm_.SetGlobalBuffer((__gm__ T *)inputX);
    expandedXGm_.SetGlobalBuffer((__gm__ int8_t *)expandedX);

    // Select the row-index source: the user buffer when its layout matches the
    // copy path, otherwise the remapped copy staged in the workspace.
    if constexpr (EP) {
        if (rowIdxType_ == SCATTER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        }
    } else {
        if (rowIdxType_ == GATHER) {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        } else {
            expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(n_ * k_, sizeof(int32_t)) +
                                                  blockIdx_ * perCoreRow_,
                                              Align(coreRows_, sizeof(int32_t)));
        }
    }


    // Static quantization: one scale and one offset, read once.
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale, 1);
    offsetGm_.SetGlobalBuffer((__gm__ float *)offset, 1);
    scale_ = scaleGm_.GetValue(0);
    offset_ = offsetGm_.GetValue(0);

    // UB queues: T-typed input tile, int8 output tile, index slice, and
    // float/half scratch for the quantization cast chain.
    pipe_->InitBuffer(inputXCopyInQueue_, GATHER_OUT_QUANT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(T)));
    pipe_->InitBuffer(inputXCopyOutQueue_, GATHER_OUT_QUANT_BUFFER_NUM, AlignBytes(perLoopCols_, sizeof(int8_t)));
    pipe_->InitBuffer(expandRowIdxCopyInQueue_, GATHER_OUT_QUANT_BUFFER_NUM, AlignBytes(perLoopRows_, sizeof(int32_t)));
    pipe_->InitBuffer(floatQueue_, 1, AlignBytes(perLoopCols_, sizeof(float)));
    pipe_->InitBuffer(halfQueue_, 1, AlignBytes(perLoopCols_, sizeof(half)));
}
|
||||
|
||||
template <typename T, const int EP>
// Stage the per-loop slice of expanded row indices from GM into UB.
// `progress` is the row-loop counter; currentLoopRows_ rows are copied
// starting at progress * perLoopRows_ within this core's index window.
__aicore__ inline void MoeGatherOutQuant<T, EP>::CopyExpertIn(int64_t progress)
{
    indicesOffset_ = progress * perLoopRows_;
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.AllocTensor<int32_t>();
    // Non-aligned copy: blockCount=1, blockLen in bytes; no padding applied.
    DataCopyExtParams dataCopyParams{1, static_cast<uint32_t>(currentLoopRows_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(indicesLocal, expandedRowIdxGm_[indicesOffset_], dataCopyParams, dataCopyPadParams);
    expandRowIdxCopyInQueue_.EnQue<int32_t>(indicesLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Copy `curLoopCols` elements of one input row (starting at GM offset
// `xSrcOffset`) into UB for quantization.
__aicore__ inline void MoeGatherOutQuant<T, EP>::CopyXIn(int64_t xSrcOffset, int64_t curLoopCols)
{
    LocalTensor<T> inLocal = inputXCopyInQueue_.AllocTensor<T>();
    DataCopyExtParams copyParams0{static_cast<uint16_t>(1), static_cast<uint32_t>(curLoopCols * sizeof(T)), 0, 0, 0};
    DataCopyPadExtParams<T> padParams0{false, 0, 0, 0};
    DataCopyPad(inLocal, inputXGm_[xSrcOffset], copyParams0, padParams0);
    inputXCopyInQueue_.EnQue(inLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Copy `curLoopCols` quantized int8 elements from UB to the expanded output
// tensor at GM offset `xDstOffset`, then release the UB tensor.
__aicore__ inline void MoeGatherOutQuant<T, EP>::CopyXOut(int64_t xDstOffset, int64_t curLoopCols)
{
    LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.DeQue<int8_t>();
    DataCopyExtParams copyParams2{1, static_cast<uint32_t>(curLoopCols * sizeof(int8_t)), 0, 0, 0};
    DataCopyPad(expandedXGm_[xDstOffset], outLocal, copyParams2);
    inputXCopyOutQueue_.FreeTensor(outLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Static-quantize one tile of `curLoopCols` elements to int8:
//   out = trunc(round_to_half(rint(x * scale_ + offset_)))
// Float inputs are used directly; half/bf16 inputs are first cast to float.
// PipeBarrier<PIPE_V>() between vector ops enforces the required ordering on
// the vector pipeline; the cast chain float->int32->half->int8 follows the
// hardware's supported conversion paths.
__aicore__ inline void MoeGatherOutQuant<T, EP>::Compute(int64_t curLoopCols)
{
    LocalTensor<float> floatLocal;
    LocalTensor<T> inLocal;
    LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.AllocTensor<int8_t>();
    LocalTensor<half> halfLocal = halfQueue_.AllocTensor<half>();
    uint32_t elements = Align(curLoopCols, sizeof(T));
    if constexpr (IsSameType<T, float>::value) {
        // Input is already float: quantize in place on the copy-in tensor.
        floatLocal = inputXCopyInQueue_.DeQue<float>();
    } else {
        inLocal = inputXCopyInQueue_.DeQue<T>();
        floatLocal = floatQueue_.AllocTensor<float>();
        Cast(floatLocal, inLocal, RoundMode::CAST_NONE, elements);
        PipeBarrier<PIPE_V>();
    }
    Muls(floatLocal, floatLocal, scale_, elements);
    PipeBarrier<PIPE_V>();
    Adds(floatLocal, floatLocal, offset_, elements);
    PipeBarrier<PIPE_V>();
    // Reuse the float buffer for the int32 intermediate (same byte size).
    LocalTensor<int32_t> intLocal = floatLocal.ReinterpretCast<int32_t>();
    Cast(intLocal, floatLocal, RoundMode::CAST_RINT, elements);
    PipeBarrier<PIPE_V>();
    // Identity dequant scale so the int32->half conversion is value-preserving.
    SetDeqScale((half)1.000000e+00f);
    PipeBarrier<PIPE_V>();
    Cast(halfLocal, intLocal, RoundMode::CAST_ROUND, elements);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, halfLocal, RoundMode::CAST_TRUNC, elements);
    inputXCopyOutQueue_.EnQue(outLocal);
    if constexpr (IsSameType<T, float>::value) {
        inputXCopyInQueue_.FreeTensor(floatLocal);
    } else {
        inputXCopyInQueue_.FreeTensor(inLocal);
        floatQueue_.FreeTensor(floatLocal);
    }
    halfQueue_.FreeTensor(halfLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Scatter-mode output (EP path): for each local destination row, read its
// source row id from the staged index tensor, quantize the source row in
// column tiles, and write it to this core's dense destination slot.
__aicore__ inline void MoeGatherOutQuant<T, EP>::ScatterCopyOut(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.DeQue<int32_t>();
    // Indices must have landed in UB before scalar GetValue() reads below.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    for (int64_t indicesIndex = 0; indicesIndex < currentLoopRows_; indicesIndex++) {
        int64_t rowOffset = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        int64_t rowIdx = indicesLocal.GetValue(indicesIndex);
        // rowIdx indexes the expanded (n*k) space; divide by k_ to recover
        // the source token row in the input.
        int64_t xSrcOffset = rowIdx / k_ * cols_;
        int64_t xDstOffset = (rowOffset + indicesIndex) * cols_;
        int64_t curLoopCols = perLoopCols_;
        // In dropless mode rows at or beyond activeNum_ are not emitted.
        if (activeNum_ > 0 && dropPadMode_ == DROPLESS_MODE && (rowOffset + indicesIndex) >= activeNum_) {
            break;
        }
        // Scalar index computation must complete before the next MTE2 copy-in.
        SetWaitFlag<HardEvent::S_MTE2>(HardEvent::S_MTE2);
        for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
            if (colsLoop == colLoops_ - 1) {
                curLoopCols = lastLoopCols_;
            }
            int64_t colsLoopOffset = colsLoop * perLoopCols_;
            CopyXIn(xSrcOffset + colsLoopOffset, curLoopCols);
            Compute(curLoopCols);
            CopyXOut(xDstOffset + colsLoopOffset, curLoopCols);
        }
    }
    expandRowIdxCopyInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Gather-mode output (non-EP path): iterate column tiles in the outer loop so
// that each distinct source row is loaded and quantized once, then scattered
// to every destination row (from the index tensor) that maps back to it.
// Destination index -1, or an index >= activeNum_ in dropless mode, is skipped.
__aicore__ inline void MoeGatherOutQuant<T, EP>::GatherCopyOut(int64_t progress)
{
    LocalTensor<int32_t> indicesLocal = expandRowIdxCopyInQueue_.DeQue<int32_t>();
    // Indices must be in UB before scalar GetValue() reads below.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    colsTileLength_ = perLoopCols_;
    for (int64_t colsLoop = 0; colsLoop < colLoops_; colsLoop++) {
        int64_t initialRow = perCoreRow_ * blockIdx_ + perLoopRows_ * progress;
        int64_t curLoopRow = 0;
        if (colsLoop == colLoops_ - 1) {
            colsTileLength_ = lastLoopCols_;
        }
        // Expanded rows [initialRow, initialRow + currentLoopRows_) map onto
        // source rows [initialRow/k_, (initialRow+currentLoopRows_-1)/k_].
        int64_t currentLoopStartRow = initialRow / k_;
        int64_t currentLoopLastRow = (initialRow + currentLoopRows_ - 1) / k_;
        for (int64_t row = currentLoopStartRow; row <= currentLoopLastRow; row++) {
            inputOffset_ = row * cols_ + colsLoop * perLoopCols_;
            // input row position
            CopyXIn(inputOffset_, colsTileLength_);
            Compute(colsTileLength_);
            LocalTensor<int8_t> outLocal = inputXCopyOutQueue_.DeQue<int8_t>();
            DataCopyExtParams intriParams{1, static_cast<uint32_t>(colsTileLength_ * sizeof(int8_t)), 0, 0, 0};
            SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
            // Emit this quantized source row to every expanded row that
            // belongs to it (consecutive expanded rows share initialRow/k_).
            while (curLoopRow < currentLoopRows_ && initialRow / k_ == row) {
                int32_t outIndex = indicesLocal.GetValue(curLoopRow);
                curLoopRow++;
                initialRow++;
                if (outIndex == -1 || (dropPadMode_ == DROPLESS_MODE && outIndex >= activeNum_)) {
                    continue;
                }
                outOffset_ = outIndex * cols_ + colsLoop * perLoopCols_;
                DataCopyPad(expandedXGm_[outOffset_], outLocal, intriParams);
            }
            inputXCopyOutQueue_.FreeTensor(outLocal);
        }
    }
    expandRowIdxCopyInQueue_.FreeTensor(indicesLocal);
}
|
||||
|
||||
template <typename T, const int EP>
// Kernel entry for this stage: cores beyond needCoreNum_ are idle. Each row
// loop stages the index slice, then routes rows via the scatter (EP) or
// gather (non-EP) output path; the final loop handles the shorter tail.
__aicore__ inline void MoeGatherOutQuant<T, EP>::Process()
{
    if (blockIdx_ < needCoreNum_) {
        currentLoopRows_ = perLoopRows_;
        for (int64_t loop = 0; loop < rowLoops_; loop++) {
            if (loop == rowLoops_ - 1) {
                currentLoopRows_ = lastLoopRows_;
            }
            CopyExpertIn(loop);
            if constexpr (EP) {
                ScatterCopyOut(loop);
            } else {
                GatherCopyOut(loop);
            }
        }
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_GATHER_STATIC_QUANT_H
|
||||
207
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort.h
Normal file
207
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort.h
Normal file
@@ -0,0 +1,207 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_H
|
||||
#define MOE_CUSTOM_MRGSORT_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Tiling parameters for a multi-list merge sort pass.
struct MoeMrgsortParam {
    int64_t perListElements;    // element count of each list except the last
    int64_t lastListElements;   // element count of the final (possibly shorter) list
    int64_t oneLoopMaxElements; // max elements merged from one list per loop (UB capacity)
};
|
||||
|
||||
// Merges up to four pre-sorted lists (in AscendC sort-region layout) from GM
// into one sorted output stream, looping until all input elements are consumed.
// Usage: SetInput() once per list (max 4), SetOutput(), Init(), Process().
class MoeMrgsort {
public:
    __aicore__ inline MoeMrgsort(){};
    __aicore__ inline void Init(MoeMrgsortParam *param);
    __aicore__ inline void Process();
    __aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput);
    __aicore__ inline void SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput);

private:
    __aicore__ inline void CopyIn();
    __aicore__ inline void UpdateMrgParam();
    __aicore__ inline void MrgsortCompute();
    __aicore__ inline void UpdateSortInfo();
    __aicore__ inline void CopyOut();
    __aicore__ inline void ClearCache();

private:
    MoeMrgsortParam *param = nullptr;

    GlobalTensor<float> gmInputs[4]; // one GM source per input list
    GlobalTensor<float> gmOutput;

    LocalTensor<float> ubInputs[4];  // UB staging buffer per input list
    LocalTensor<float> ubOutput;

    int64_t listNum{0};              // number of registered input lists
    int64_t remainListNum{0};        // lists with elements left in the current loop
    int64_t outOffset{0};            // write position (sort-region units) in gmOutput
    int64_t offsets[4];              // per-list read offset into GM
    int64_t listRemainElements[4];   // per-list unconsumed element count
    int64_t lengths[4];              // per-list element count staged this loop
    int64_t allRemainElements{0};    // total unconsumed elements across lists
    int64_t curLoopSortedNum{0};     // elements emitted by the current loop

    // for MrgSort
    uint16_t validBitTail{0};        // bitmask of valid MrgSort source lists
    uint16_t elementCountListTail[4];
    uint32_t listSortedNums[4];      // per-list elements consumed by MrgSort
    LocalTensor<float> tmpUbInputs[4]; // compacted (non-empty) UB inputs
};
|
||||
|
||||
// Reset per-run bookkeeping so this sorter instance can be reused
// for a subsequent merge pass.
__aicore__ inline void MoeMrgsort::ClearCache()
{
    outOffset = 0;
    allRemainElements = 0;
    listNum = 0;
}
|
||||
|
||||
// Register one pre-sorted input list: its GM source and UB staging buffer.
// Lists occupy slots in registration order; at most 4 are supported.
__aicore__ inline void MoeMrgsort::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput)
{
    const int64_t slot = listNum;
    gmInputs[slot] = gmInput;
    ubInputs[slot] = ubInput;
    listNum = slot + 1;
}
|
||||
|
||||
// Register the GM destination and the UB buffer that receives each loop's
// merged result before copy-out.
__aicore__ inline void MoeMrgsort::SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput)
{
    this->gmOutput = gmOutput;
    this->ubOutput = ubOutput;
}
|
||||
|
||||
// Derive the MrgSort valid-list bitmask from the number of lists that still
// have data this loop, zeroing the element counts of unused tail slots.
// One bit per source list, lowest bit = list 0.
__aicore__ inline void MoeMrgsort::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;
    } else {
        // Zero or one remaining list; MrgsortCompute bypasses MrgSort then.
        validBitTail = 0b0001;
    }
}
|
||||
|
||||
// Stage the next chunk of each non-exhausted list from GM into UB, capped at
// oneLoopMaxElements per list, and compact the active lists into tmpUbInputs.
__aicore__ inline void MoeMrgsort::CopyIn()
{
    this->remainListNum = 0;
    // Previous loop's copy-out must finish before its UB buffers are refilled.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            // Copy in sort-region units (GetSortLen) rather than raw elements.
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the staged chunks of the active lists into ubOutput with the MrgSort
// hardware instruction (exhausted-suspension mode). With a single active list
// the data is simply copied through; listSortedNums reports per-list
// consumption for UpdateSortInfo.
__aicore__ inline void MoeMrgsort::MrgsortCompute()
{
    // Copy-in must complete before the vector MrgSort reads UB.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused source slots are filled with list 0; validBitTail masks them off.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: already sorted, pass it through unchanged.
        DataCopy(this->ubOutput, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Account for the elements MrgSort consumed this loop: shrink per-list and
// total remain counts, advance per-list GM read offsets, and total the number
// of elements emitted (curLoopSortedNum). Index j walks the compacted
// active-list arrays while i walks all registered lists.
__aicore__ inline void MoeMrgsort::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Copy this loop's merged result (in sort-region layout) from UB to GM and
// advance the output write offset.
__aicore__ inline void MoeMrgsort::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = GetSortLen<float>(curLoopSortedNum) * sizeof(float);
    // Vector merge must finish before MTE3 reads ubOutput.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput[outOffset], this->ubOutput, intriParams);
    outOffset += GetSortLen<float>(curLoopSortedNum);
}
|
||||
|
||||
// Initialize per-list offsets and remain counts from the tiling parameters.
// Must be called after all SetInput() registrations. allRemainElements is
// accumulated, relying on its zero initial value / ClearCache() reset from a
// prior Process() run.
__aicore__ inline void MoeMrgsort::Init(MoeMrgsortParam *param)
{
    this->param = param;
    this->remainListNum = listNum;

    for (int64_t i = 0; i < listNum; i++) {
        // Lists are laid out back-to-back in GM, perListElements apart
        // (in sort-region units).
        offsets[i] = GetSortOffset<float>(param->perListElements * i);
        if (i == listNum - 1) {
            listRemainElements[i] = param->lastListElements;
        } else {
            listRemainElements[i] = param->perListElements;
        }
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Drive the merge loop until every registered list is fully consumed,
// then reset internal state for possible reuse.
__aicore__ inline void MoeMrgsort::Process()
{
    while (allRemainElements > 0) {
        CopyIn();           // stage next chunk of each live list
        UpdateMrgParam();   // recompute valid-list bitmask
        MrgsortCompute();   // merge staged chunks in UB
        UpdateSortInfo();   // account consumed elements / offsets
        CopyOut();          // flush merged chunk to GM
    }

    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_H
|
||||
232
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort_out.h
Normal file
232
csrc/moe_init_routing_custom/op_kernel/moe_custom_mrgsort_out.h
Normal file
@@ -0,0 +1,232 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort_out.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_OUT_H
|
||||
#define MOE_CUSTOM_MRGSORT_OUT_H
|
||||
|
||||
#include "moe_custom_mrgsort.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
class MoeMrgsortOut {
|
||||
public:
|
||||
__aicore__ inline MoeMrgsortOut(){};
|
||||
__aicore__ inline void Init(MoeMrgsortParam *param, TPipe *tPipe);
|
||||
__aicore__ inline void Process();
|
||||
__aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput);
|
||||
__aicore__ inline void SetOutput(GlobalTensor<int32_t> &gmOutput1, GlobalTensor<int32_t> &gmOutput2,
|
||||
LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2);
|
||||
__aicore__ inline void SetBuffer(LocalTensor<float> &tempBuffer);
|
||||
|
||||
private:
|
||||
__aicore__ inline void CopyIn();
|
||||
__aicore__ inline void UpdateMrgParam();
|
||||
__aicore__ inline void MrgsortCompute();
|
||||
__aicore__ inline void UpdateSortInfo();
|
||||
__aicore__ inline void Extract();
|
||||
__aicore__ inline void CopyOut();
|
||||
__aicore__ inline void ClearCache();
|
||||
|
||||
private:
|
||||
MoeMrgsortParam *param = nullptr;
|
||||
|
||||
GlobalTensor<float> gmInputs[4];
|
||||
GlobalTensor<int32_t> gmOutput1;
|
||||
GlobalTensor<int32_t> gmOutput2;
|
||||
|
||||
LocalTensor<float> ubInputs[4];
|
||||
LocalTensor<float> tempBuffer;
|
||||
|
||||
// for extract
|
||||
LocalTensor<float> ubOutput1;
|
||||
LocalTensor<uint32_t> ubOutput2;
|
||||
|
||||
// for copy out
|
||||
LocalTensor<int32_t> ubOutputInt1;
|
||||
LocalTensor<int32_t> ubOutputInt2;
|
||||
|
||||
int64_t listNum{0};
|
||||
int64_t remainListNum{0};
|
||||
int64_t outOffset{0};
|
||||
int64_t offsets[4];
|
||||
int64_t listRemainElements[4];
|
||||
int64_t lengths[4];
|
||||
int64_t allRemainElements{0};
|
||||
int64_t curLoopSortedNum{0};
|
||||
|
||||
// for MrgSort
|
||||
uint16_t validBitTail;
|
||||
uint16_t elementCountListTail[4];
|
||||
uint32_t listSortedNums[4];
|
||||
LocalTensor<float> tmpUbInputs[4];
|
||||
};
|
||||
|
||||
// Reset per-run bookkeeping so this instance can run another merge pass.
__aicore__ inline void MoeMrgsortOut::ClearCache()
{
    outOffset = 0;
    allRemainElements = 0;
    listNum = 0;
}
|
||||
|
||||
// Register one pre-sorted input list (GM source + UB staging buffer);
// lists fill slots in registration order, at most 4.
__aicore__ inline void MoeMrgsortOut::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput)
{
    const int64_t slot = listNum;
    gmInputs[slot] = gmInput;
    ubInputs[slot] = ubInput;
    listNum = slot + 1;
}
|
||||
|
||||
// Register the two GM destinations and the UB buffers used by Extract():
// ubOutput1 receives scores (also viewed as int32 for copy-out) and
// ubOutput2 receives indices (viewed as uint32 for Extract, int32 for copy-out).
__aicore__ inline void MoeMrgsortOut::SetOutput(GlobalTensor<int32_t> &gmOutput1, GlobalTensor<int32_t> &gmOutput2,
                                                LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2)
{
    this->gmOutput1 = gmOutput1;
    this->ubOutput1 = ubOutput1;
    this->ubOutputInt1 = ubOutput1.ReinterpretCast<int32_t>();

    this->gmOutput2 = gmOutput2;
    this->ubOutput2 = ubOutput2.ReinterpretCast<uint32_t>();
    this->ubOutputInt2 = ubOutput2.ReinterpretCast<int32_t>();
}
|
||||
|
||||
// Register the scratch UB buffer that holds the merged sort-region result
// before Extract() splits it into scores and indices.
__aicore__ inline void MoeMrgsortOut::SetBuffer(LocalTensor<float> &tempBuffer)
{
    this->tempBuffer = tempBuffer;
}
|
||||
|
||||
// Derive the MrgSort valid-list bitmask from the number of lists that still
// have data this loop, zeroing the counts of unused tail slots.
// One bit per source list, lowest bit = list 0.
__aicore__ inline void MoeMrgsortOut::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;
    } else {
        // Zero or one remaining list; MrgsortCompute bypasses MrgSort then.
        validBitTail = 0b0001;
    }
}
|
||||
|
||||
// Stage the next chunk of each non-exhausted list from GM into UB, capped at
// oneLoopMaxElements per list, and compact the active lists into tmpUbInputs.
__aicore__ inline void MoeMrgsortOut::CopyIn()
{
    this->remainListNum = 0;
    // Previous loop's copy-out must finish before its UB buffers are refilled.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            // Copy in sort-region units (GetSortLen) rather than raw elements.
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the staged chunks of the active lists into tempBuffer with the MrgSort
// hardware instruction (exhausted-suspension mode). With a single active list
// the data is copied through unchanged; listSortedNums records per-list
// consumption for UpdateSortInfo.
__aicore__ inline void MoeMrgsortOut::MrgsortCompute()
{
    // Copy-in must complete before the vector MrgSort reads UB.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused source slots are filled with list 0; validBitTail masks them off.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: already sorted, pass it through unchanged.
        DataCopy(this->tempBuffer, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Account for the elements MrgSort consumed this loop: shrink per-list and
// total remain counts, advance per-list GM read offsets, and total the
// emitted element count. Index j walks the compacted active-list arrays
// while i walks all registered lists.
__aicore__ inline void MoeMrgsortOut::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Split the merged sort regions in tempBuffer into scores (ubOutput1) and
// indices (ubOutput2), undo the score negation applied before sorting
// (Muls by -1 — presumably the sort is descending on negated values; confirm
// against the producing kernel), then round scores to int32 for copy-out.
__aicore__ inline void MoeMrgsortOut::Extract()
{
    AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
    Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
    Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
}
|
||||
|
||||
// Copy this loop's extracted scores and indices (as int32) from UB to their
// GM destinations and advance the shared output offset.
__aicore__ inline void MoeMrgsortOut::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = curLoopSortedNum * sizeof(int32_t);
    // Vector extract/cast must finish before MTE3 reads the UB outputs.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput1[outOffset], this->ubOutputInt1, intriParams);
    DataCopyPad(this->gmOutput2[outOffset], this->ubOutputInt2, intriParams);

    outOffset += curLoopSortedNum;
}
|
||||
|
||||
// Initialize per-list offsets and remain counts from the tiling parameters.
// Must be called after all SetInput() registrations. `tPipe` is accepted for
// interface symmetry but unused here.
__aicore__ inline void MoeMrgsortOut::Init(MoeMrgsortParam *param, TPipe *tPipe)
{
    this->param = param;
    this->allRemainElements = 0;
    for (int64_t i = 0; i < listNum; i++) {
        // Lists are laid out back-to-back in GM, perListElements apart
        // (in sort-region units).
        offsets[i] = GetSortOffset<float>(param->perListElements * i);
        if (i == listNum - 1) {
            listRemainElements[i] = param->lastListElements;
        } else {
            listRemainElements[i] = param->perListElements;
        }
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Drive the merge-and-extract loop until every registered list is consumed,
// then reset internal state for possible reuse.
__aicore__ inline void MoeMrgsortOut::Process()
{
    while (allRemainElements > 0) {
        CopyIn();           // stage next chunk of each live list
        UpdateMrgParam();   // recompute valid-list bitmask
        MrgsortCompute();   // merge staged chunks into tempBuffer
        UpdateSortInfo();   // account consumed elements / offsets
        Extract();          // split into scores and indices
        CopyOut();          // flush both outputs to GM
    }
    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_OUT_H
|
||||
@@ -0,0 +1,239 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort_out_performance.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_OUT_PERFORMANCE_H
|
||||
#define MOE_CUSTOM_MRGSORT_OUT_PERFORMANCE_H
|
||||
|
||||
#include "moe_custom_mrgsort_performance.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Hardware MrgSort merges at most 4 lists at once.
constexpr int64_t MAX_MRGSORT_LIST = 4;
// Total source lists handled by the performance path (4 groups of 4).
constexpr int64_t MAX_MRGSORT_LIST_TOTAL = 16;
|
||||
|
||||
class MoeMrgsortOutPerformance {
|
||||
public:
|
||||
__aicore__ inline MoeMrgsortOutPerformance(){};
|
||||
__aicore__ inline void Init(MoeMrgsortPerformanceParam *param, TPipe *tPipe);
|
||||
__aicore__ inline void Process();
|
||||
__aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
|
||||
GlobalTensor<int32_t> &gmActualSortNum);
|
||||
__aicore__ inline void SetOutput(GlobalTensor<int32_t> &gmOutput1, GlobalTensor<int32_t> &gmOutput2,
|
||||
LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2);
|
||||
__aicore__ inline void SetBuffer(LocalTensor<float> &tempBuffer);
|
||||
|
||||
private:
|
||||
__aicore__ inline void CopyIn();
|
||||
__aicore__ inline void UpdateMrgParam();
|
||||
__aicore__ inline void MrgsortCompute();
|
||||
__aicore__ inline void UpdateSortInfo();
|
||||
__aicore__ inline void Extract();
|
||||
__aicore__ inline void CopyOut();
|
||||
__aicore__ inline void ClearCache();
|
||||
|
||||
private:
|
||||
MoeMrgsortPerformanceParam *param = nullptr;
|
||||
|
||||
GlobalTensor<float> gmInputs[4];
|
||||
GlobalTensor<int32_t> gmOutput1;
|
||||
GlobalTensor<int32_t> gmOutput2;
|
||||
GlobalTensor<int32_t> gmActualSortNum;
|
||||
|
||||
LocalTensor<float> ubInputs[4];
|
||||
LocalTensor<float> tempBuffer;
|
||||
|
||||
// for extract
|
||||
LocalTensor<float> ubOutput1;
|
||||
LocalTensor<uint32_t> ubOutput2;
|
||||
|
||||
// for copy out
|
||||
LocalTensor<int32_t> ubOutputInt1;
|
||||
LocalTensor<int32_t> ubOutputInt2;
|
||||
|
||||
int64_t listNum{0};
|
||||
int64_t remainListNum{0};
|
||||
int64_t outOffset{0};
|
||||
int64_t offsets[4] = {0};
|
||||
int64_t listRemainElements[4] = {0};
|
||||
int64_t lengths[4] = {0};
|
||||
int64_t allRemainElements{0};
|
||||
int64_t curLoopSortedNum{0};
|
||||
|
||||
// for MrgSort
|
||||
uint16_t validBitTail;
|
||||
uint16_t elementCountListTail[4] = {0};
|
||||
uint32_t listSortedNums[4] = {0};
|
||||
LocalTensor<float> tmpUbInputs[4];
|
||||
};
|
||||
|
||||
// Reset per-run bookkeeping so this instance can run another merge pass.
__aicore__ inline void MoeMrgsortOutPerformance::ClearCache()
{
    outOffset = 0;
    allRemainElements = 0;
    listNum = 0;
}
|
||||
|
||||
// Register one pre-sorted input list (GM source + UB staging buffer).
// The actual-sort-count tensor is captured once, on the first registration;
// subsequent calls pass it again but it is ignored.
__aicore__ inline void MoeMrgsortOutPerformance::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
                                                          GlobalTensor<int32_t> &gmActualSortNum)
{
    if (this->listNum == 0) {
        this->gmActualSortNum = gmActualSortNum;
    }
    this->gmInputs[listNum] = gmInput;
    this->ubInputs[listNum] = ubInput;
    this->listNum += 1;
}
|
||||
|
||||
// Register the two GM destinations and the UB buffers used by Extract():
// ubOutput1 receives scores (also viewed as int32 for copy-out) and
// ubOutput2 receives indices (viewed as uint32 for Extract, int32 for copy-out).
__aicore__ inline void MoeMrgsortOutPerformance::SetOutput(GlobalTensor<int32_t> &gmOutput1,
                                                           GlobalTensor<int32_t> &gmOutput2,
                                                           LocalTensor<float> &ubOutput1, LocalTensor<float> &ubOutput2)
{
    this->gmOutput1 = gmOutput1;
    this->ubOutput1 = ubOutput1;
    this->ubOutputInt1 = ubOutput1.ReinterpretCast<int32_t>();

    this->gmOutput2 = gmOutput2;
    this->ubOutput2 = ubOutput2.ReinterpretCast<uint32_t>();
    this->ubOutputInt2 = ubOutput2.ReinterpretCast<int32_t>();
}
|
||||
|
||||
// Register the scratch UB buffer that holds the merged sort-region result
// before Extract() splits it into scores and indices.
__aicore__ inline void MoeMrgsortOutPerformance::SetBuffer(LocalTensor<float> &tempBuffer)
{
    this->tempBuffer = tempBuffer;
}
|
||||
|
||||
// Derive the MrgSort valid-list bitmask from the number of lists that still
// have data this loop, zeroing the counts of unused tail slots.
// One bit per source list, lowest bit = list 0.
__aicore__ inline void MoeMrgsortOutPerformance::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;
    } else {
        // Zero or one remaining list; MrgsortCompute bypasses MrgSort then.
        validBitTail = 0b0001;
    }
}
|
||||
|
||||
// Stage the next chunk of each non-exhausted list from GM into UB, capped at
// oneLoopMaxElements per list, and compact the active lists into tmpUbInputs.
__aicore__ inline void MoeMrgsortOutPerformance::CopyIn()
{
    this->remainListNum = 0;
    // Previous loop's copy-out must finish before its UB buffers are refilled.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            // Copy in sort-region units (GetSortLen) rather than raw elements.
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the staged chunks of the active lists into tempBuffer with the MrgSort
// hardware instruction (exhausted-suspension mode). With a single active list
// the data is copied through unchanged; listSortedNums records per-list
// consumption for UpdateSortInfo.
__aicore__ inline void MoeMrgsortOutPerformance::MrgsortCompute()
{
    // Copy-in must complete before the vector MrgSort reads UB.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused source slots are filled with list 0; validBitTail masks them off.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->tempBuffer, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: already sorted, pass it through unchanged.
        DataCopy(this->tempBuffer, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Account for the elements MrgSort consumed this loop: shrink per-list and
// total remain counts, advance per-list GM read offsets, and total the
// emitted element count. Index j walks the compacted active-list arrays
// while i walks all registered lists.
__aicore__ inline void MoeMrgsortOutPerformance::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Split the merged (score, index) records in tempBuffer into two tensors,
// then negate the extracted scores (Muls by -1 — presumably undoing a
// negation applied upstream to obtain the desired sort order; TODO confirm)
// and cast them to int32 with round-to-nearest into ubOutputInt1.
__aicore__ inline void MoeMrgsortOutPerformance::Extract()
{
    AscendC::Extract(this->ubOutput1, this->ubOutput2, this->tempBuffer, Ceil(curLoopSortedNum, ONE_REPEAT_SORT_NUM));
    Muls(this->ubOutput1, this->ubOutput1, (float)-1, Align(curLoopSortedNum, sizeof(float)));
    Cast(this->ubOutputInt1, this->ubOutput1, RoundMode::CAST_ROUND, Align(curLoopSortedNum, sizeof(float)));
}
|
||||
|
||||
// Write the curLoopSortedNum int32 results of this pass to GM and advance
// the running output offset.
// NOTE(review): ubOutputInt2 is copied out here but its conversion from
// ubOutput2 is not visible in this file section — confirm it is filled
// before this point.
__aicore__ inline void MoeMrgsortOutPerformance::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = curLoopSortedNum * sizeof(int32_t);  // blockLen is in bytes for DataCopyPad
    // V->MTE3 event: results must be fully produced in UB before the DMA out.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput1[outOffset], this->ubOutputInt1, intriParams);
    DataCopyPad(this->gmOutput2[outOffset], this->ubOutputInt2, intriParams);

    outOffset += curLoopSortedNum;
}
|
||||
|
||||
// Initialize per-list remaining counts and GM read offsets.
// Each of the listNum output-stage lists aggregates MAX_MRGSORT_LIST
// source counts from gmActualSortNum (hence the i / MAX_MRGSORT_LIST
// fold over MAX_MRGSORT_LIST_TOTAL entries).
__aicore__ inline void MoeMrgsortOutPerformance::Init(MoeMrgsortPerformanceParam *param, TPipe *tPipe)
{
    this->param = param;
    for (int64_t i = 0; i < MAX_MRGSORT_LIST_TOTAL; i++) {
        listRemainElements[i / MAX_MRGSORT_LIST] += static_cast<int64_t>(gmActualSortNum.GetValue(i));
    }
    for (int64_t i = 0; i < listNum; i++) {
        // Each list's region in GM spans perListElements * MAX_MRGSORT_LIST
        // sorted records (record size accounted for by GetSortOffset).
        offsets[i] = GetSortOffset<float>(param->perListElements * i * MAX_MRGSORT_LIST);
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Main loop: repeatedly load chunks of the sorted sub-lists, merge them,
// extract and convert the results, and write them out, until every input
// element has been consumed. Resets cached state afterwards so the object
// can be reused.
__aicore__ inline void MoeMrgsortOutPerformance::Process()
{
    for (; allRemainElements > 0;) {
        CopyIn();
        UpdateMrgParam();
        MrgsortCompute();
        UpdateSortInfo();
        Extract();
        CopyOut();
    }
    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_OUT_PERFORMANCE_H
|
||||
@@ -0,0 +1,206 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_mrgsort_performance.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_MRGSORT_PERFORMANCE_H
|
||||
#define MOE_CUSTOM_MRGSORT_PERFORMANCE_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Tiling parameters for the merge-sort stage.
struct MoeMrgsortPerformanceParam {
    int64_t perListElements;    // capacity (in elements) of each input list's GM region
    int64_t oneLoopMaxElements; // max elements taken from each list per merge pass
};
|
||||
|
||||
// Merges up to 4 independently sorted lists (float (score, index) records
// in GM) into a single sorted output stream, processing a bounded chunk of
// each list per loop iteration.
// Usage: SetInput() once per source list, SetOutput() once, then Init()
// followed by Process().
class MoeMrgsortPerformance {
public:
    __aicore__ inline MoeMrgsortPerformance(){};
    __aicore__ inline void Init(MoeMrgsortPerformanceParam *param);
    __aicore__ inline void Process();
    __aicore__ inline void SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
                                    GlobalTensor<int32_t> &gmActualSortNum);
    __aicore__ inline void SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput);

private:
    __aicore__ inline void CopyIn();
    __aicore__ inline void UpdateMrgParam();
    __aicore__ inline void MrgsortCompute();
    __aicore__ inline void UpdateSortInfo();
    __aicore__ inline void CopyOut();
    __aicore__ inline void ClearCache();

private:
    MoeMrgsortPerformanceParam *param = nullptr;

    GlobalTensor<float> gmInputs[4];          // one GM source region per input list
    GlobalTensor<float> gmOutput;             // merged output region
    GlobalTensor<int32_t> gmActualSortNum;    // actual element count per source list

    LocalTensor<float> ubInputs[4];           // UB staging buffer per input list
    LocalTensor<float> ubOutput;              // UB merge destination

    int64_t listNum{0};                       // number of registered input lists (<= 4)
    int64_t remainListNum{0};                 // lists still holding data in the current pass
    int64_t outOffset{0};                     // running write offset into gmOutput
    int64_t offsets[4];                       // per-list GM read offsets
    int64_t listRemainElements[4];            // unconsumed elements per list
    int64_t lengths[4];                       // elements taken from each list this pass
    int64_t allRemainElements{0};             // total unconsumed elements across lists
    int64_t curLoopSortedNum{0};              // elements merged in the current pass

    // for MrgSort
    uint16_t validBitTail{0};                 // bitmask of active MrgSortSrcList slots
    uint16_t elementCountListTail[4];         // per-slot element counts fed to MrgSort
    uint32_t listSortedNums[4];               // per-slot merged counts returned by MrgSort
    LocalTensor<float> tmpUbInputs[4];        // compacted view of the active UB inputs
};
|
||||
|
||||
// Reset the reusable state so the next SetInput()/Init()/Process() cycle
// starts from a clean slate.
__aicore__ inline void MoeMrgsortPerformance::ClearCache()
{
    this->outOffset = 0;
    this->allRemainElements = 0;
    this->listNum = 0;
}
|
||||
|
||||
// Register one sorted source list (GM region + its UB staging buffer).
// The actual-count tensor is captured only on the first call; subsequent
// calls are assumed to pass the same gmActualSortNum.
// Must be called at most 4 times (array capacity); not bounds-checked here.
__aicore__ inline void MoeMrgsortPerformance::SetInput(GlobalTensor<float> &gmInput, LocalTensor<float> &ubInput,
                                                       GlobalTensor<int32_t> &gmActualSortNum)
{
    if (this->listNum == 0) {
        this->gmActualSortNum = gmActualSortNum;
    }
    this->gmInputs[listNum] = gmInput;
    this->ubInputs[listNum] = ubInput;
    this->listNum += 1;
}
|
||||
|
||||
// Register the merge destination: the GM output region and the UB buffer
// that MrgsortCompute writes merged records into.
__aicore__ inline void MoeMrgsortPerformance::SetOutput(GlobalTensor<float> &gmOutput, LocalTensor<float> &ubOutput)
{
    this->ubOutput = ubOutput;
    this->gmOutput = gmOutput;
}
|
||||
|
||||
// Derive the MrgSort valid-slot bitmask from the number of lists that still
// hold data this pass, zeroing the element counts of unused tail slots.
__aicore__ inline void MoeMrgsortPerformance::UpdateMrgParam()
{
    if (this->remainListNum == MERGE_LIST_TWO) {
        elementCountListTail[MERGE_LIST_IDX_TWO] = 0;
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0011;    // slots 0 and 1 active
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        elementCountListTail[MERGE_LIST_IDX_THREE] = 0;
        validBitTail = 0b0111;    // slots 0..2 active
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        validBitTail = 0b1111;    // all four slots active
    } else {
        validBitTail = 0b0001;    // single-list (or idle) case
    }
}
|
||||
|
||||
// Load the next chunk of each still-active list from GM into UB and compact
// the active lists into tmpUbInputs / elementCountListTail for MrgSort.
__aicore__ inline void MoeMrgsortPerformance::CopyIn()
{
    this->remainListNum = 0;
    // MTE3->MTE2 event: the previous pass's copy-out must finish before the
    // UB input buffers are overwritten.
    event_t eventIdMte3ToMte2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
    SetFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    WaitFlag<HardEvent::MTE3_MTE2>(eventIdMte3ToMte2);
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        // Take at most oneLoopMaxElements from this list per pass.
        lengths[i] = Min(param->oneLoopMaxElements, listRemainElements[i]);
        if (lengths[i] > 0) {
            DataCopy(this->ubInputs[i], this->gmInputs[i][offsets[i]],
                     Align(GetSortLen<float>(lengths[i]), sizeof(float)));
            tmpUbInputs[j] = this->ubInputs[i];
            // Narrowing int64 -> uint16; assumes oneLoopMaxElements fits in
            // uint16_t (MrgSort count type) — TODO confirm against tiling.
            elementCountListTail[j] = lengths[i];
            this->remainListNum += 1;
            j++;
        }
    }
}
|
||||
|
||||
// Merge the 2..4 active sub-lists staged in UB directly into ubOutput.
// remainListNum / elementCountListTail / validBitTail were prepared by
// CopyIn and UpdateMrgParam; listSortedNums receives per-list merged counts.
__aicore__ inline void MoeMrgsortPerformance::MrgsortCompute()
{
    // MTE2->V event: the CopyIn DMA must complete before the vector unit
    // reads the staged lists.
    event_t eventIdMte2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
    SetFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    WaitFlag<HardEvent::MTE2_V>(eventIdMte2ToV);
    if (this->remainListNum == MERGE_LIST_TWO) {
        // Unused MrgSortSrcList slots are padded with list 0; validBitTail
        // masks them out of the merge.
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[0], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_THREE) {
        MrgSortSrcList sortListTail =
            MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO], tmpUbInputs[0]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else if (this->remainListNum == MERGE_LIST_FOUR) {
        MrgSortSrcList sortListTail = MrgSortSrcList(tmpUbInputs[0], tmpUbInputs[1], tmpUbInputs[MERGE_LIST_IDX_TWO],
                                                     tmpUbInputs[MERGE_LIST_IDX_THREE]);
        MrgSort<float, true>(this->ubOutput, sortListTail, elementCountListTail, listSortedNums, validBitTail, 1);
    } else {
        // Single remaining list: copy its sorted region through unchanged.
        DataCopy(this->ubOutput, this->tmpUbInputs[0],
                 Align(GetSortLen<float>(elementCountListTail[0]), sizeof(float)));
        listSortedNums[0] = elementCountListTail[0];
    }
}
|
||||
|
||||
// Book-keeping after one merge pass: consume the merged counts from the
// per-list and global remaining counters and advance each active list's GM
// read offset. j indexes only the active lists, mirroring CopyIn's compaction.
__aicore__ inline void MoeMrgsortPerformance::UpdateSortInfo()
{
    curLoopSortedNum = 0;
    for (int64_t i = 0, j = 0; i < listNum; i++) {
        if (lengths[i] > 0) {
            // update remain size
            listRemainElements[i] -= listSortedNums[j];
            allRemainElements -= listSortedNums[j];
            // update offset
            offsets[i] += GetSortOffset<float>(listSortedNums[j]);
            // update current loop sorted nums
            curLoopSortedNum += listSortedNums[j];
            j += 1;
        }
    }
}
|
||||
|
||||
// Write the merged (score, index) records of this pass to GM, keeping the
// full sort-record layout (hence GetSortLen rather than raw element count),
// and advance the output offset in sort-record units.
__aicore__ inline void MoeMrgsortPerformance::CopyOut()
{
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = GetSortLen<float>(curLoopSortedNum) * sizeof(float);  // bytes
    // V->MTE3 event: the merge must be complete in UB before the DMA out.
    event_t eventIdVToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
    SetFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    WaitFlag<HardEvent::V_MTE3>(eventIdVToMte3);
    DataCopyPad(this->gmOutput[outOffset], this->ubOutput, intriParams);
    outOffset += GetSortLen<float>(curLoopSortedNum);
}
|
||||
|
||||
// Initialize per-list GM read offsets (each list owns a perListElements-sized
// region) and remaining counts from gmActualSortNum (captured in SetInput).
// Must be called after all SetInput() calls, since it iterates listNum.
__aicore__ inline void MoeMrgsortPerformance::Init(MoeMrgsortPerformanceParam *param)
{
    this->param = param;
    for (int64_t i = 0; i < listNum; i++) {
        offsets[i] = GetSortOffset<float>(param->perListElements * i);
        listRemainElements[i] = static_cast<int64_t>(gmActualSortNum.GetValue(i));
        allRemainElements += listRemainElements[i];
    }
}
|
||||
|
||||
// Main loop: repeatedly load chunks of the sorted sub-lists, merge them in
// UB and write the merged records out, until every input element has been
// consumed. Unlike MoeMrgsortOutPerformance::Process, no Extract step —
// results stay in sort-record form. Resets cached state afterwards.
__aicore__ inline void MoeMrgsortPerformance::Process()
{
    for (; allRemainElements > 0;) {
        CopyIn();
        UpdateMrgParam();
        MrgsortCompute();
        UpdateSortInfo();
        CopyOut();
    }

    ClearCache();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_MRGSORT_PERFORMANCE_H
|
||||
@@ -0,0 +1,204 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_row_idx_gather.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_ROW_IDX_GATHER_H
|
||||
#define MOE_CUSTOM_ROW_IDX_GATHER_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Produces the expanded-row-index mapping for MoE routing: generates
// consecutive destination indices (via a precomputed assist table) and
// scatters them into expandedRowIdx at positions given by the sorted expert
// indices. Work is tiled across cores and loops per the tiling data.
class RowIdxGather {
public:
    __aicore__ inline RowIdxGather(){};
    __aicore__ inline void Init(GM_ADDR expandedRowIdx, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn(int64_t loop, int64_t elements);
    __aicore__ inline void Compute(int64_t loop, int64_t elements);
    __aicore__ inline void CopyOut(int64_t loop, int64_t elements, GlobalTensor<int32_t> &RowIdxDstGm_);
    __aicore__ inline void AssistInit();

private:
    GlobalTensor<int32_t> expandedRowIdxGm_;       // output mapping (or workspace staging when ep_==0 && SCATTER)
    GlobalTensor<int32_t> sortedExpertIndicesGm_;  // this core's slice of sorted indices (scatter destinations)
    GlobalTensor<int64_t> expertTokensCountGm_;
    GlobalTensor<int32_t> expertTotalCountGm_;     // total routed-token count written by an earlier stage (ep path)
    GlobalTensor<int32_t> assistGm_;               // precomputed 0..ASSIST_NUM-1 index table
    GlobalTensor<int32_t> gatherIndicesGm_;

    TPipe *pipe_;

    TQue<QuePosition::VECIN, 1> sortedExpertIndicesInQueue_;
    TQue<QuePosition::VECOUT, 1> copyOutQueue_;
    TBuf<TPosition::VECCALC> assistBuffer_;

    const MoeCustomSrcToDstComputeTilingData *srcToDstComputeTilingData_;
    int64_t blockIdx_;             // this core's index
    int64_t needCoreNum_;          // cores actually needed for expertTotalCount_
    int64_t perCoreElements_;      // elements handled per core
    int64_t actualExpertNum_ = 0;
    int64_t ep_ = 0;               // expert-parallel flag from tiling
    int64_t rowIdxType_ = 0;       // SCATTER vs. gather layout selector
    int64_t expertTotalCount_ = 0; // total number of routed tokens

    int64_t loops_ = 0;            // loop count for this core
    int64_t perLoopElements_ = 0;
    int64_t lastLoopElements_ = 0;
};
|
||||
|
||||
// Load the assist index table from GM and bias it by this core's starting
// element, so each entry holds the core-global destination index base.
__aicore__ inline void RowIdxGather::AssistInit()
{
    LocalTensor<int32_t> assistTensor = assistBuffer_.Get<int32_t>(ASSIST_NUM);
    DataCopy(assistTensor, assistGm_, ASSIST_NUM);
    // Wait for the DMA before the vector Adds reads the table.
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
    Adds(assistTensor, assistTensor, (int32_t)(blockIdx_ * perCoreElements_), ASSIST_NUM);
}
|
||||
|
||||
// Configure GM buffers and per-core/per-loop tiling.
// Workspace layout used here (int32 units, each region aligned):
//   [0, n*k)                     : first staging region
//   [n*k, 2*n*k)                 : sorted-indices / staging region
//   [2*n*k + actualExpertNum, ..): expert total count (ep path)
// (TODO confirm layout against the workspace producer, not visible here.)
__aicore__ inline void RowIdxGather::Init(GM_ADDR expandedRowIdx, GM_ADDR workspace,
                                          const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    pipe_ = tPipe;
    srcToDstComputeTilingData_ = &(tilingData->srcToDstComputeParamsOp);
    blockIdx_ = GetBlockIdx();
    actualExpertNum_ = tilingData->actualExpertNum;
    ep_ = tilingData->ep;
    rowIdxType_ = tilingData->rowIdxType;

    expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, actualExpertNum_);

    if (ep_) {
        // Expert-parallel: the routed-token total is produced by a previous
        // stage in workspace; invalidate the cache line before reading it.
        expertTotalCountGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                                Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2 +
                                                Align(actualExpertNum_, sizeof(int32_t)),
                                            actualExpertNum_);
        AscendC::DataCacheCleanAndInvalid<int32_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                                          AscendC::DcciDst::CACHELINE_OUT>(expertTotalCountGm_);
        expertTotalCount_ = expertTotalCountGm_.GetValue(0);
    } else {
        expertTotalCount_ = tilingData->n * tilingData->k;
    }
    // 'assist' is an external index table symbol (defined elsewhere — presumably
    // in moe_custom_common.h; verify).
    assistGm_.SetGlobalBuffer((__gm__ int32_t *)assist, ASSIST_NUM);
    perCoreElements_ = Ceil(expertTotalCount_, srcToDstComputeTilingData_->needCoreNum);
    needCoreNum_ = Ceil(expertTotalCount_, perCoreElements_);

    // Split each core's share into loops; the last core and the last loop
    // carry the remainders.
    int64_t lastCoreElements = expertTotalCount_ - (needCoreNum_ - 1) * perCoreElements_;
    int64_t perCoreLoops = Ceil(perCoreElements_, srcToDstComputeTilingData_->perCorePerLoopElements);
    int64_t perCorePerLoopElements = Ceil(perCoreElements_, perCoreLoops);
    int64_t perCoreLastLoopElements = perCoreElements_ - (perCoreLoops - 1) * perCorePerLoopElements;

    int64_t lastCoreLoops = Ceil(lastCoreElements, srcToDstComputeTilingData_->perCorePerLoopElements);
    int64_t lastCorePerLoopElements = Ceil(lastCoreElements, lastCoreLoops);
    int64_t lastCoreLastLoopELements = lastCoreElements - (lastCoreLoops - 1) * lastCorePerLoopElements;

    loops_ = perCoreLoops;
    if (blockIdx_ == needCoreNum_ - 1) {
        loops_ = lastCoreLoops;
        perLoopElements_ = lastCorePerLoopElements;
        lastLoopElements_ = lastCoreLastLoopELements;
    } else {
        loops_ = perCoreLoops;
        perLoopElements_ = perCorePerLoopElements;
        lastLoopElements_ = perCoreLastLoopElements;
    }

    // Source of the scatter destinations depends on the row-index layout.
    if (rowIdxType_ == SCATTER) {
        sortedExpertIndicesGm_.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx + blockIdx_ * perCoreElements_,
                                               actualExpertNum_);
    } else {
        sortedExpertIndicesGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                                   Align(tilingData->n * tilingData->k, sizeof(int32_t)) +
                                                   blockIdx_ * perCoreElements_,
                                               actualExpertNum_);
    }

    // SCATTER without expert-parallel writes results into workspace instead
    // of directly into the output buffer.
    if ((ep_ == 0 && rowIdxType_ == SCATTER) && (blockIdx_ < needCoreNum_)) {
        expandedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                          Align(tilingData->n * tilingData->k, sizeof(int32_t)));
    }
    pipe_->InitBuffer(sortedExpertIndicesInQueue_, 1, AlignBytes(perLoopElements_, sizeof(int32_t)));
    // One 32-byte block per output element (scatter writes one element per pad copy).
    pipe_->InitBuffer(copyOutQueue_, 1, Ceil(perLoopElements_, ASSIST_NUM) * ASSIST_NUM * BLOCK_BYTES);
    pipe_->InitBuffer(assistBuffer_, ASSIST_NUM * sizeof(int32_t));
}
|
||||
|
||||
// Drive the per-loop CopyIn/Compute/CopyOut pipeline on participating cores.
// The ep_==1 && SCATTER combination has nothing to do here (results already
// in place); every core still hits the trailing SyncAll barrier.
__aicore__ inline void RowIdxGather::Process()
{
    if (ep_ == 1 && rowIdxType_ == SCATTER) {
        return;
    } else {
        if (blockIdx_ < needCoreNum_) {
            AssistInit();
            for (int64_t loop = 0; loop < loops_; loop++) {
                int64_t elements = perLoopElements_;
                if (loop == loops_ - 1) {
                    elements = lastLoopElements_;
                }
                CopyIn(loop, elements);
                Compute(loop, elements);
                CopyOut(loop, elements, expandedRowIdxGm_);
            }
        }
    }
    // NOTE(review): the early return above skips this barrier on the
    // ep_==1 && SCATTER path — confirm that is intended.
    AscendC::SyncAll();
}
|
||||
|
||||
// Load this loop's slice of sorted expert indices (the scatter destinations)
// from GM into UB via the input queue. DataCopyPad handles the unaligned tail.
__aicore__ inline void RowIdxGather::CopyIn(int64_t loop, int64_t elements)
{
    LocalTensor<int32_t> sortedExpertIndicesInLocal = sortedExpertIndicesInQueue_.AllocTensor<int32_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(elements * sizeof(int32_t)), 0, 0,
                                     0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(sortedExpertIndicesInLocal, sortedExpertIndicesGm_[loop * perLoopElements_], dataCopyParams,
                dataCopyPadParams);
    sortedExpertIndicesInQueue_.EnQue(sortedExpertIndicesInLocal);
}
|
||||
|
||||
// Generate the consecutive source-row indices for this loop's elements:
// each ASSIST_INDEX_NUM-sized chunk is the biased assist table plus the
// chunk's global offset (perLoopElements_ * loop + i * ASSIST_INDEX_NUM).
__aicore__ inline void RowIdxGather::Compute(int64_t loop, int64_t elements)
{
    LocalTensor<int32_t> outLocal = copyOutQueue_.AllocTensor<int32_t>();
    LocalTensor<int32_t> assistTensor = assistBuffer_.Get<int32_t>(ASSIST_NUM);
    PipeBarrier<PIPE_V>();
    int64_t loops = Ceil(elements, ASSIST_INDEX_NUM);
    for (int64_t i = 0; i < loops; i++) {
        Adds(outLocal[i * ASSIST_NUM], assistTensor,
             static_cast<int32_t>(perLoopElements_ * loop + i * ASSIST_INDEX_NUM), ASSIST_NUM);
    }
    PipeBarrier<PIPE_V>();
    copyOutQueue_.EnQue<int32_t>(outLocal);
}
|
||||
|
||||
// Scatter: for each element, read its destination index from inLocal on the
// scalar unit and DMA one int32 from outLocal to that GM position.
// One DataCopyPad per element — scalar-driven scatter, with S->MTE3 sync
// inside the loop so each address read is ordered before its copy.
__aicore__ inline void RowIdxGather::CopyOut(int64_t loop, int64_t elements, GlobalTensor<int32_t> &RowIdxDstGm_)
{
    LocalTensor<int32_t> inLocal = sortedExpertIndicesInQueue_.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue_.DeQue<int32_t>();
    // Ensure the destination indices are in UB before the scalar reads below.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    intriParams.blockLen = sizeof(int32_t);  // one element per transfer
    uint32_t outOffset;
    for (int64_t idx = 0; idx < elements; idx++) {
        outOffset = inLocal.GetValue(idx);
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        // outLocal holds one value per 32-byte block (INT32_ONE_BLOCK_NUM stride).
        DataCopyPad(RowIdxDstGm_[outOffset], outLocal[idx * INT32_ONE_BLOCK_NUM], intriParams);
    }

    sortedExpertIndicesInQueue_.FreeTensor(inLocal);
    copyOutQueue_.FreeTensor(outLocal);
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_ROW_IDX_GATHER_H
|
||||
@@ -0,0 +1,306 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_row_idx_gather_droppad.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_H
|
||||
#define MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Drop-and-pad routing: builds the src->dst row mapping under a fixed
// per-expert capacity. Tokens beyond an expert's capacity are dropped;
// unfilled capacity slots in expandedX (and optionally expandedScale) are
// zero-padded. T is the token dtype; TilingData supplies the per-core split.
template <typename T, typename TilingData>
class MoeCustomSrcToDstWithCapacity {
public:
    __aicore__ inline MoeCustomSrcToDstWithCapacity(){};
    __aicore__ inline void Init(GM_ADDR expandedRowIdx, GM_ADDR expandedX, GM_ADDR expandedScale, GM_ADDR workspace,
                                const TilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn(int64_t progress);
    __aicore__ inline void CopyOut(int64_t progress);
    __aicore__ inline void CopyOutRemain();
    __aicore__ inline void SyncAll();
    __aicore__ inline void AssistInit();

private:
    TPipe *pipe;
    TQue<QuePosition::VECIN, 1> copyInQueue;        // dst->src rows + expert indices
    TQue<QuePosition::VECOUT, 1> copyOutQueue;      // single-index scatter staging
    TQue<QuePosition::VECOUT, 1> copyOutZeroQueue;  // zero-filled row used for padding
    TQue<QuePosition::VECOUT, 1> scaleOutZeroQueue; // zero scale used for padding

    GlobalTensor<int32_t> expandDstToSrcRowGm;   // sorted dst->src row mapping (workspace)
    GlobalTensor<int32_t> expandedRowIdxGm;      // output: src row -> capacity slot index
    GlobalTensor<int32_t> expertIdxValueGm;      // per-core (last expert id, count) pairs (workspace)
    GlobalTensor<int32_t> expandedExpertIdxGm;   // sorted expert index per row (workspace)
    GlobalTensor<T> expandedXGm;                 // output token buffer, expertNum * expertCapacity rows
    GlobalTensor<float> expandedScaleGm;         // output scale buffer (optional)

    LocalTensor<T> outTmpLocal;                  // zero row for padding dropped slots
    LocalTensor<float> scaleLocal;               // zero scale for padding

    const MoeCustomSrcToDstCapacityComputeTilingData *srcToDstTilingData;
    int64_t coreNum;
    int64_t blockIdx;
    int64_t totalLength;       // n * k routed tokens
    int64_t currentLoopRows;   // rows handled in the current loop
    int64_t coreRows;          // rows assigned to this core
    int64_t perLoopRows;
    int64_t lastLoopRows;
    int64_t rowLoops;
    int64_t expertCapacity;    // max tokens kept per expert
    int64_t expertNum;
    int64_t cols;              // hidden dimension
    int64_t perLoopCols;
    int64_t lastLoopCols;
    int64_t colLoops;
    int64_t isInputScale_;
    int64_t quantMode_;

    int64_t tokenCount = 0;            // tokens emitted so far for lastExpertId
    int32_t lastExpertId = -1;         // expert currently being filled (-1 = not started)
    int32_t lastCoreExpertId = 0;      // carry-in expert id from preceding cores
    int32_t lastCoreExpertIdNum = 0;   // carry-in token count for that expert
    bool needScaleCopy = false;        // pad expandedScale too (non-quant scale path)
};
|
||||
|
||||
// Prepare the zero-padding tensors and compute this core's carry-in state:
// which expert the previous cores left off at (lastCoreExpertId) and how
// many of its tokens they already emitted (lastCoreExpertIdNum).
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::AssistInit()
{
    // int8 rows are zero-filled via int16 lanes (half the element count,
    // same bytes) because Duplicate is used on a wider type here.
    if constexpr (IsSameType<T, int8_t>::value) {
        LocalTensor<int16_t> outLocal = copyOutZeroQueue.AllocTensor<int16_t>();
        Duplicate<int16_t>(outLocal, static_cast<int16_t>(0), this->perLoopCols);
        copyOutZeroQueue.EnQue<int16_t>(outLocal);
    } else {
        LocalTensor<T> outLocal = copyOutZeroQueue.AllocTensor<T>();
        Duplicate<T>(outLocal, static_cast<T>(0), this->perLoopCols);
        copyOutZeroQueue.EnQue<T>(outLocal);
    }
    if (this->needScaleCopy) {
        LocalTensor<float> scaleOutLocal = scaleOutZeroQueue.AllocTensor<float>();
        Duplicate<float>(scaleOutLocal, 0.0f, FP32_ONE_BLOCK_NUM);
        scaleOutZeroQueue.EnQue<float>(scaleOutLocal);
    }

    if (this->blockIdx != 0) {
        // expertIdxValueGm holds one (expert id, count) pair per core.
        this->lastCoreExpertId = expertIdxValueGm.GetValue((this->blockIdx - 1) * 2);
        this->lastCoreExpertIdNum = expertIdxValueGm.GetValue((this->blockIdx - 1) * 2 + 1);
        // Walk earlier cores backwards, accumulating counts while they ended
        // on the same expert; stop at the first core that ended on an
        // earlier expert.
        for (int64_t i = this->blockIdx - 2; i >= 0; i--) {
            int32_t lastExpertIdx = expertIdxValueGm.GetValue(i * 2);
            if (lastExpertIdx < this->lastCoreExpertId) {
                break;
            }
            int32_t lastExpertNum = expertIdxValueGm.GetValue(i * 2 + 1);
            this->lastCoreExpertIdNum += lastExpertNum;
        }
    }
}
|
||||
|
||||
// Load this loop's slice of the dst->src row mapping and the matching sorted
// expert indices into one UB tensor: rows at [0, length), expert indices at
// [length, 2*length).
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::CopyIn(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopy(inLocal, expandDstToSrcRowGm[progress * perLoopRows], length);
    DataCopy(inLocal[length], expandedExpertIdxGm[progress * perLoopRows], length);
    copyInQueue.EnQue<int32_t>(inLocal);
}
|
||||
|
||||
// Per-row scatter with capacity enforcement. Walking the sorted rows:
//  - whenever the expert id advances past lastExpertId, first zero-pad the
//    unfilled capacity slots of every skipped expert (token rows and,
//    optionally, scales);
//  - if the current expert still has capacity, record this row's capacity
//    slot index into expandedRowIdxGm at the row's source position;
//  - rows arriving after an expert is full fall through without a write,
//    i.e. the token is dropped (its expandedRowIdx entry is presumably
//    pre-initialized elsewhere — TODO confirm).
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::CopyOut(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    DataCopyExtParams ScaleParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};

    // Indices are read scalar-side below; wait for the CopyIn DMA.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    if (this->lastExpertId == -1) {
        // First loop on this core: resume from the carry-in computed in AssistInit.
        this->lastExpertId = this->lastCoreExpertId;
        this->tokenCount = this->lastCoreExpertIdNum;
    }
    for (int64_t idx = 0; idx < currentLoopRows; idx++) {
        // Expert index for this row lives in the second half of inLocal.
        int32_t expertIdx = inLocal[length].GetValue(idx);
        int32_t index = 0;
        while (this->lastExpertId < expertIdx) {
            // Zero-pad the rest of the previous expert's capacity region.
            while (this->tokenCount < this->expertCapacity) {
                index = this->lastExpertId * this->expertCapacity + this->tokenCount;
                if (this->needScaleCopy) {
                    DataCopyPad(expandedScaleGm[index], this->scaleLocal, ScaleParams);
                }
                int64_t col = this->perLoopCols;
                for (int64_t i = 0; i < this->colLoops; i++) {
                    if (i == this->colLoops - 1) {
                        col = this->lastLoopCols;
                    }
                    DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(T)), 0,
                                                  0, 0};
                    DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal,
                                copyParams1);
                }
                this->tokenCount++;
            }
            this->tokenCount = 0;
            this->lastExpertId++;
        }

        if (this->tokenCount < this->expertCapacity) {
            // Expert still has room: map source row -> its capacity slot.
            int32_t outOffset = inLocal.GetValue(idx);
            index = expertIdx * this->expertCapacity + this->tokenCount;
            outLocal.SetValue(0, index);
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm[outOffset], outLocal, copyParams);
            this->tokenCount++;
        }
    }
    copyInQueue.FreeTensor(inLocal);
    copyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// After the row loop, the last participating core zero-pads all remaining
// capacity slots from lastExpertId up to expertNum. Other cores only release
// their padding tensors. Always frees outTmpLocal/scaleLocal on every path.
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::CopyOutRemain()
{
    if (this->blockIdx != this->srcToDstTilingData->needCoreNum - 1) {
        copyOutZeroQueue.FreeTensor(this->outTmpLocal);
        if (this->needScaleCopy) {
            scaleOutZeroQueue.FreeTensor(this->scaleLocal);
        }
        return;
    }
    DataCopyExtParams ScaleParams{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
    while (this->lastExpertId < this->expertNum) {
        while (this->tokenCount < this->expertCapacity) {
            int32_t index = this->lastExpertId * this->expertCapacity + this->tokenCount;
            if (this->needScaleCopy) {
                DataCopyPad(expandedScaleGm[index], this->scaleLocal, ScaleParams);
            }
            int64_t col = this->perLoopCols;
            for (int64_t i = 0; i < this->colLoops; i++) {
                if (i == this->colLoops - 1) {
                    col = this->lastLoopCols;
                }
                DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(T)), 0, 0, 0};
                DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal, copyParams);
                // Order the DMA before the next scalar-side iteration.
                SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
            }
            this->tokenCount++;
        }
        this->tokenCount = 0;
        this->lastExpertId++;
    }
    copyOutZeroQueue.FreeTensor(this->outTmpLocal);
    if (this->needScaleCopy) {
        scaleOutZeroQueue.FreeTensor(this->scaleLocal);
    }
}
|
||||
|
||||
// Inter-core barrier; a no-op on single-core launches and under the CPU
// kernel-test build (__CCE_KT_TEST__), where hardware sync is unavailable.
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::SyncAll()
{
    if (coreNum == 1) {
        return;
    }
#ifndef __CCE_KT_TEST__
    AscendC::SyncAll();
#endif
}
|
||||
|
||||
// Configure tiling, GM buffers, and UB queues for the drop/pad pass.
// Workspace layout used here (int32 units, each region aligned):
//   [0, n*k)          : sorted expert indices
//   [n*k, 2*n*k)      : dst->src row mapping
//   [2*n*k + 2*expertNum, ..) : per-core (expert id, count) pairs
// (TODO confirm layout against the workspace producer, not visible here.)
template <typename T, typename TilingData>
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::Init(GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                                                          GM_ADDR expandedScale, GM_ADDR workspace,
                                                                          const TilingData *tilingData,
                                                                          TPipe *tPipe)
{
    int64_t blockNum = GetBlockNum();
    this->pipe = tPipe;
    this->blockIdx = GetBlockIdx();

    this->coreNum = tilingData->coreNum;
    this->totalLength = tilingData->n * tilingData->k;
    this->srcToDstTilingData = &(tilingData->srcToDstDropPadParamsOp);
    this->expertNum = tilingData->expertNum;
    this->expertCapacity = tilingData->expertCapacity;
    this->cols = tilingData->cols;
    this->isInputScale_ = tilingData->isInputScale;
    this->quantMode_ = tilingData->quantMode;

    // The last participating core carries the remainder rows/loops.
    if (this->blockIdx == this->srcToDstTilingData->needCoreNum - 1) {
        this->coreRows = this->srcToDstTilingData->lastCoreRows;
        this->perLoopRows = this->srcToDstTilingData->lastCorePerLoopRows;
        this->lastLoopRows = this->srcToDstTilingData->lastCoreLastLoopRows;
        this->rowLoops = this->srcToDstTilingData->lastCoreLoops;
    } else {
        this->coreRows = this->srcToDstTilingData->perCoreRows;
        this->perLoopRows = this->srcToDstTilingData->perCorePerLoopRows;
        this->lastLoopRows = this->srcToDstTilingData->perCoreLastLoopRows;
        this->rowLoops = this->srcToDstTilingData->perCoreLoops;
    }
    this->perLoopCols = this->srcToDstTilingData->perLoopCols;
    this->lastLoopCols = this->srcToDstTilingData->lastLoopCols;
    this->colLoops = this->srcToDstTilingData->colLoops;
    // Scale padding only applies when scales are supplied and no quant mode is set.
    this->needScaleCopy = (this->isInputScale_ != 0 && this->quantMode_ == -1);

    expandedScaleGm.SetGlobalBuffer((__gm__ float *)expandedScale);

    int64_t length = Align(this->totalLength, sizeof(int32_t));
    expandedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, length);
    expandedXGm.SetGlobalBuffer((__gm__ T *)expandedX, this->expertNum * this->expertCapacity * this->cols);

    expandedExpertIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                            this->blockIdx * this->srcToDstTilingData->perCoreRows,
                                        Align(this->coreRows, sizeof(int32_t)));
    expandDstToSrcRowGm.SetGlobalBuffer((__gm__ int32_t *)workspace + length +
                                            this->blockIdx * this->srcToDstTilingData->perCoreRows,
                                        Align(this->coreRows, sizeof(int32_t)));
    expertIdxValueGm.SetGlobalBuffer(
        (__gm__ int32_t *)workspace + length * 2 + Align(this->expertNum, sizeof(int32_t)) * 2, this->coreNum * 2);

    // Double-width in-queue: rows and expert indices share one tensor (see CopyIn).
    pipe->InitBuffer(copyInQueue, 1, AlignBytes(this->perLoopRows, sizeof(int32_t)) * 2);
    pipe->InitBuffer(copyOutQueue, 1, AlignBytes(INT32_ONE_BLOCK_NUM, sizeof(int32_t)));
    if constexpr (IsSameType<T, int8_t>::value) {
        pipe->InitBuffer(copyOutZeroQueue, 1, AlignBytes(this->perLoopCols, sizeof(int16_t)));
    } else {
        pipe->InitBuffer(copyOutZeroQueue, 1, AlignBytes(this->perLoopCols, sizeof(T)));
    }
    if (this->needScaleCopy) {
        pipe->InitBuffer(scaleOutZeroQueue, 1, BLOCK_BYTES);
    }
}
|
||||
|
||||
template <typename T, typename TilingData>
// Main entry for this stage: rows assigned to this core are processed in
// rowLoops chunks of perLoopRows rows (the final chunk uses lastLoopRows),
// then any remaining expert slots are padded via CopyOutRemain().
// Cores beyond needCoreNum do no work but still join the final SyncAll()
// so every core reaches the cross-core barrier.
__aicore__ inline void MoeCustomSrcToDstWithCapacity<T, TilingData>::Process()
{
    if (this->blockIdx < this->srcToDstTilingData->needCoreNum) {
        // Prepares the zero-fill row/scale tensors and per-core expert bookkeeping.
        AssistInit();
        // DeQue hands ownership of the zero-fill tensors to this stage; they
        // are released later (inside CopyOutRemain, per the sibling class).
        this->outTmpLocal = copyOutZeroQueue.DeQue<T>();
        if (this->needScaleCopy) {
            this->scaleLocal = scaleOutZeroQueue.DeQue<float>();
        }
        currentLoopRows = perLoopRows;
        for (int64_t loop = 0; loop < this->rowLoops; loop++) {
            if (loop == this->rowLoops - 1) {
                // Final iteration covers the remainder rows.
                currentLoopRows = lastLoopRows;
            }
            CopyIn(loop);
            CopyOut(loop);
        }
        // Zero-pad the tail expert slots (only the last active core does real work here).
        CopyOutRemain();
    }
    // All cores, active or not, must hit the barrier.
    this->SyncAll();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_H
|
||||
// ===== begin file: moe_custom_row_idx_gather_droppad_dynamic.h =====
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_row_idx_gather_droppad_dynamic.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_DYNAMIC_H
|
||||
#define MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_DYNAMIC_H
|
||||
|
||||
#include "moe_custom_common.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
template <typename T, typename TilingData>
// Kernel stage that scatters tokens into per-expert capacity slots
// (drop/pad semantics) while applying dynamic int8 quantization:
// each emitted row is scaled by its abs-max (optionally after a smooth
// scale) and written to expandedX as int8, with the per-row scale
// written to dynamicQuantScale.
class MoeCustomSrcToDstAndGather {
public:
    __aicore__ inline MoeCustomSrcToDstAndGather(){};
    // Binds GM buffers and unpacks tiling; must be called before Process().
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR scale, GM_ADDR expandedRowIdx, GM_ADDR expandedX,
                                GM_ADDR dynamicQuantScale, GM_ADDR workspace, const TilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Loads the sorted dst->src row map and expert ids for one row chunk.
    __aicore__ inline void CopyIn(int64_t progress);
    // Single-pass path: whole row fits in UB (colLoops == 1).
    __aicore__ inline void CopyOut(int64_t progress);
    // Tiled path: row is split across colLoops column tiles (colLoops > 1).
    __aicore__ inline void CopyOutLoops(int64_t progress);
    // Quantizes one full row and writes expandedX / dynamicQuantScale.
    __aicore__ inline void Compute(int32_t srcIdx, int32_t dstIdx, int32_t expertIdx);
    // Computes the abs-max of one column tile; stages the (smoothed) fp32
    // tile into quantSrcGm for the second pass. Returns the tile max.
    __aicore__ inline float ComputeMax(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal,
                                       LocalTensor<float> &dynamicQuantLocal, int32_t srcIdx, int32_t expertIdx,
                                       int64_t j);
    // Second pass of the tiled path: divides a staged tile by scaleTemp and
    // casts it to int8.
    __aicore__ inline void ComputeScale(LocalTensor<float> &inLocal, LocalTensor<float> &tempLocal, float scaleTemp,
                                        int64_t dstIndex, int64_t j);
    // Two-pass (max, then scale) quantization of one row across column tiles.
    __aicore__ inline void ComputeLoops(int32_t srcIdx, int32_t dstIdx, int32_t expertIdx);

    // Zero-pads the expert slots left unfilled after the last token.
    __aicore__ inline void CopyOutRemain();
    __aicore__ inline void SyncAll();
    // Prepares zero-fill tensors and per-core expert carry-over state.
    __aicore__ inline void AssistInit();

private:
    TPipe *pipe;
    TQue<QuePosition::VECIN, 1> copyInQueue;     // dst->src map + expert ids
    TQue<QuePosition::VECOUT, 1> copyOutQueue;   // single expandedRowIdx value
    TQue<QuePosition::VECOUT, 1> copyOutZeroQueue;  // zero row used for padding

    TQue<QuePosition::VECIN, 1> inputXInQueue;   // one row / tile of x
    TQue<QuePosition::VECIN, 1> smoothInQueue;   // smooth-scale row
    TQue<QuePosition::VECOUT, 1> calcQueue;      // fp32 scratch
    TQue<QuePosition::VECOUT, 1> inputXOutQueue; // quantized int8 output
    TQue<QuePosition::VECOUT, 1> scaleOutQueue;  // per-row quant scale
    TQue<QuePosition::VECOUT, 1> scaleOutZeroQueue;  // zero scale for padding

    GlobalTensor<int32_t> expandDstToSrcRowGm;   // workspace: sorted dst->src map
    GlobalTensor<int32_t> expandedRowIdxGm;      // output: src row -> dst slot
    GlobalTensor<int32_t> expertIdxValueGm;      // workspace: per-core (expert, count) pairs
    GlobalTensor<int32_t> expandedExpertIdxGm;   // workspace: sorted expert ids
    GlobalTensor<int8_t> expandedXGm;            // output: quantized gathered rows

    GlobalTensor<T> inputXGm;                    // input tokens
    GlobalTensor<float> quantSmoothGm;           // optional smooth scales
    GlobalTensor<float> dynamicQuantScaleGm;     // output: per-row scales
    GlobalTensor<float> quantSrcGm;              // workspace: staged fp32 tiles (colLoops > 1)

    LocalTensor<int8_t> outTmpLocal;             // zero int8 row for padding
    LocalTensor<float> scaleOutTmpLocal;         // zero scale for padding
    LocalTensor<float> smoothLocal;              // cached smooth row (SCALE_1H path)

    const MoeCustomSrcToDstCapacityComputeTilingData *srcToDstTilingData;

    int64_t coreNum;
    int64_t blockIdx;
    int64_t totalLength;     // n * k (total expanded rows)
    int64_t currentLoopRows; // rows handled in the current chunk
    int64_t coreRows;        // rows assigned to this core
    int64_t perLoopRows;
    int64_t lastLoopRows;
    int64_t rowLoops;
    int64_t expertCapacity;  // max tokens kept per expert (drop/pad)
    int64_t expertNum;
    int64_t cols;
    int64_t perLoopCols;
    int64_t lastLoopCols;
    int64_t colLoops;
    int64_t perLoopColsAlign;
    int64_t k;               // experts per token
    int64_t colsTileLength;  // current column-tile length
    int64_t smoothType;      // NO_SCALE / SCALE_1H / SCALE_EH

    // Scatter progress carried across chunks on this core.
    int64_t tokenCount = 0;          // tokens emitted for lastExpertId so far
    int32_t lastExpertId = -1;       // -1 until seeded from lastCoreExpertId
    int32_t lastCoreExpertId = 0;    // last expert handled by preceding cores
    int32_t lastCoreExpertIdNum = 0; // tokens already emitted for that expert
};
|
||||
|
||||
template <typename T, typename TilingData>
// Builds the zero-fill tensors used for padding unfilled expert slots and,
// for non-first cores, reconstructs how many tokens earlier cores already
// emitted for the expert this core starts on.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::AssistInit()
{
    // Zero row used to pad expandedX (allocated as int16 halves of the int8 buffer).
    LocalTensor<int16_t> outLocal = copyOutZeroQueue.AllocTensor<int16_t>();
    Duplicate<int16_t>(outLocal, static_cast<int16_t>(0), this->perLoopCols);
    copyOutZeroQueue.EnQue<int16_t>(outLocal);
    // Zero scale (one fp32 block) used to pad dynamicQuantScale.
    LocalTensor<float> scaleOutLocal = scaleOutZeroQueue.AllocTensor<float>();
    Duplicate<float>(scaleOutLocal, 0.0f, 8);
    scaleOutZeroQueue.EnQue<float>(scaleOutLocal);

    if (this->blockIdx != 0) {
        // Each preceding core recorded (last expert id, token count) in
        // expertIdxValueGm. Start from the immediately preceding core...
        this->lastCoreExpertId = expertIdxValueGm.GetValue((this->blockIdx - 1) * EXPERT_ID_VALUE_NUM);
        this->lastCoreExpertIdNum = expertIdxValueGm.GetValue((this->blockIdx - 1) * EXPERT_ID_VALUE_NUM + 1);
        // ...then walk backwards accumulating counts from every earlier core
        // that ended on the same expert (the sorted order guarantees earlier
        // cores have expert ids <= lastCoreExpertId; stop at the first strictly
        // smaller one).
        for (int64_t i = this->blockIdx - 2; i >= 0; i--) {
            int32_t lastExpertIdx = expertIdxValueGm.GetValue(i * EXPERT_ID_VALUE_NUM);
            if (lastExpertIdx < this->lastCoreExpertId) {
                break;
            }
            int32_t lastExpertNum = expertIdxValueGm.GetValue(i * EXPERT_ID_VALUE_NUM + 1);
            this->lastCoreExpertIdNum += lastExpertNum;
        }
    }
}
|
||||
|
||||
template <typename T, typename TilingData>
// Loads one chunk of the sorted metadata for this core:
//   [0, length)          dst->src row indices (expandDstToSrcRowGm)
//   [length, 2*length)   sorted expert ids     (expandedExpertIdxGm)
// length is currentLoopRows rounded up to an int32 block boundary.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyIn(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopy(inLocal, expandDstToSrcRowGm[progress * perLoopRows], length);
    DataCopy(inLocal[length], expandedExpertIdxGm[progress * perLoopRows], length);

    copyInQueue.EnQue<int32_t>(inLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Quantizes one full token row to int8 and writes it to its expert slot.
//   srcIdx    expanded source index; srcIdx / k selects the token row in x
//   dstIdx    destination slot (expert * expertCapacity + position)
//   expertIdx expert id, used to pick the per-expert smooth row (SCALE_EH)
// Pipeline: load row (+ optional smooth) -> optional smooth multiply ->
// abs-max -> divide by max/MAX_INT8 -> round through int32/half to int8.
// The EnQue/DeQue pairs and PipeBarrier calls order the MTE2/V/MTE3
// pipelines; do not reorder statements here.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::Compute(int32_t srcIdx, int32_t dstIdx, int32_t expertIdx)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(this->cols * sizeof(T)), 0, 0, 0};
    DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(this->cols * sizeof(int8_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    LocalTensor<float> inLocal = inputXInQueue.AllocTensor<float>();

    if constexpr (IsSameType<T, float>::value) {
        DataCopyPad(inLocal, inputXGm[srcIdx / this->k * this->cols], copyInParams, {false, 0, 0, 0});
    } else {
        // Narrow T (half/bf16): land the raw row in the upper part of the fp32
        // buffer (offset perLoopColsAlign) so the in-place Cast below can
        // widen it into the front without overlap.
        DataCopyPad(inLocal.template ReinterpretCast<T>()[perLoopColsAlign], inputXGm[srcIdx / this->k * this->cols],
                    copyInParams, {false, 0, 0, 0});
    }

    if (smoothType == SCALE_EH) {
        // Per-expert smooth row; for SCALE_1H smoothLocal was preloaded once
        // in Process().
        DataCopyPad(smoothLocal, quantSmoothGm[expertIdx * this->cols], smoothParams, {false, 0, 0, 0});
    }

    inputXInQueue.EnQue<float>(inLocal);
    smoothInQueue.EnQue(smoothLocal);
    smoothLocal = smoothInQueue.DeQue<float>();

    inLocal = inputXInQueue.DeQue<float>();

    LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
    LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();
    LocalTensor<float> dynamicQuantLocal = scaleOutQueue.AllocTensor<float>();

    if constexpr (!IsSameType<T, float>::value) {
        // Widen T -> fp32 in place (reads the staged upper half).
        Cast(inLocal, inLocal.template ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, this->cols);
        PipeBarrier<PIPE_V>();
    }

    if (smoothType != NO_SCALE) {
        Mul(inLocal, inLocal, smoothLocal, this->cols);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, this->cols);
    PipeBarrier<PIPE_V>();

    ReduceMax(dynamicQuantLocal, tempLocal, tempLocal, this->cols);
    PipeBarrier<PIPE_V>();

    // Symmetric quant scale: abs-max mapped to int8 full range.
    float maxValue = dynamicQuantLocal.GetValue(0) / MAX_INT8;

    Duplicate<float>(dynamicQuantLocal, maxValue, FP32_ONE_BLOCK_NUM);
    Duplicate<float>(tempLocal, maxValue, this->cols);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, this->cols);
    PipeBarrier<PIPE_V>();

    // Round fp32 -> int32 -> half -> int8 (hardware cast chain; half step
    // requires a unity dequant scale).
    Cast(tempLocal.ReinterpretCast<int32_t>(), tempLocal, RoundMode::CAST_RINT, this->cols);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND, this->cols);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, this->cols);

    calcQueue.FreeTensor(tempLocal);
    inputXOutQueue.EnQue(outLocal);
    scaleOutQueue.EnQue(dynamicQuantLocal);

    LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
    DataCopyPad(dynamicQuantScaleGm[dstIdx], quantScaleLocal, quantScaleParams);

    outLocal = inputXOutQueue.DeQue<int8_t>();
    DataCopyPad(expandedXGm[dstIdx * this->cols], outLocal, copyOutParams);

    inputXInQueue.FreeTensor(inLocal);
    inputXOutQueue.FreeTensor(outLocal);
    scaleOutQueue.FreeTensor(quantScaleLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Single-pass output path (colLoops == 1): walks the sorted expert ids of one
// row chunk, zero-pads every expert that finished before the current one up
// to expertCapacity, and for each kept token records its slot in
// expandedRowIdx and quantizes the row via Compute(). Tokens beyond an
// expert's capacity are dropped (the inner `if` simply skips them).
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyOut(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(this->cols * sizeof(int8_t)), 0, 0,
                                  0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    // Scalar reads below must wait for the MTE2 copy of inLocal.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    if (this->lastExpertId == -1) {
        // First chunk on this core: resume from where earlier cores stopped.
        this->lastExpertId = this->lastCoreExpertId;
        this->tokenCount = this->lastCoreExpertIdNum;
    }
    for (int64_t idx = 0; idx < currentLoopRows; idx++) {
        // Expert ids live in the second half of inLocal (see CopyIn).
        int32_t expertIdx = inLocal[length].GetValue(idx);
        int32_t index = 0;
        // Zero-pad all remaining slots of experts that are now complete.
        while (this->lastExpertId < expertIdx) {
            while (this->tokenCount < this->expertCapacity) {
                index = this->lastExpertId * this->expertCapacity + this->tokenCount;
                DataCopyPad(expandedXGm[index * this->cols], this->outTmpLocal, copyParams1);
                DataCopyPad(dynamicQuantScaleGm[index], this->scaleOutTmpLocal, quantScaleParams);
                this->tokenCount++;
            }
            this->tokenCount = 0;
            this->lastExpertId++;
        }

        // Keep the token only while the expert still has capacity (drop otherwise).
        if (this->tokenCount < this->expertCapacity) {
            int32_t outOffset = inLocal.GetValue(idx);  // original source row
            index = expertIdx * this->expertCapacity + this->tokenCount;
            outLocal.SetValue(0, index);
            // Scalar SetValue must complete before MTE3 reads outLocal.
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm[outOffset], outLocal, copyParams);
            Compute(outOffset, index, expertIdx);
            this->tokenCount++;
        }
    }
    copyInQueue.FreeTensor(inLocal);
    copyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// First pass of the tiled quantization: loads column tile j of row srcIdx,
// optionally applies the smooth scale, computes the tile's abs-max, and
// stages the (smoothed) fp32 tile into quantSrcGm so the second pass
// (ComputeScale) does not reload/re-smooth it. Returns the tile max.
// The result is parked at dynamicQuantLocal[FP32_ONE_BLOCK_NUM] to keep
// slot 0 free for the final scale.
__aicore__ inline float MoeCustomSrcToDstAndGather<T, TilingData>::ComputeMax(LocalTensor<float> &inLocal,
                                                                              LocalTensor<float> &tempLocal,
                                                                              LocalTensor<float> &dynamicQuantLocal,
                                                                              int32_t srcIdx, int32_t expertIdx, int64_t j)
{
    // Local smooth tile; shadows the member smoothLocal on purpose here.
    LocalTensor<float> smoothLocal = smoothInQueue.AllocTensor<float>();

    DataCopyExtParams intriParamsT{1, static_cast<uint32_t>(colsTileLength * sizeof(T)), 0, 0, 0};
    DataCopyExtParams intriParamsFp32{1, static_cast<uint32_t>(colsTileLength * sizeof(float)), 0, 0, 0};

    if constexpr (!IsSameType<T, float>::value) {
        // Narrow T: stage raw data in the upper half of the fp32 buffer so the
        // widening Cast below can write to the front without overlap.
        DataCopyPad(inLocal.ReinterpretCast<T>()[perLoopColsAlign],
                    inputXGm[srcIdx * this->cols + j * this->perLoopCols], intriParamsT, {false, 0, 0, 0});
    } else {
        DataCopyPad(inLocal, inputXGm[srcIdx * this->cols + j * this->perLoopCols], intriParamsT, {false, 0, 0, 0});
    }

    inputXInQueue.EnQue<float>(inLocal);
    inLocal = inputXInQueue.DeQue<float>();

    if constexpr (!IsSameType<T, float>::value) {
        Cast(inLocal, inLocal.ReinterpretCast<T>()[perLoopColsAlign], RoundMode::CAST_NONE, colsTileLength);
        PipeBarrier<PIPE_V>();
    }

    if (smoothType != NO_SCALE) {
        DataCopyPad(smoothLocal, quantSmoothGm[expertIdx * this->cols + j * this->perLoopCols], intriParamsFp32,
                    {false, 0, 0, 0});
        smoothInQueue.EnQue(smoothLocal);
        smoothLocal = smoothInQueue.DeQue<float>();

        Mul(inLocal, inLocal, smoothLocal, colsTileLength);
        PipeBarrier<PIPE_V>();
    }

    Abs(tempLocal, inLocal, colsTileLength);
    PipeBarrier<PIPE_V>();

    ReduceMax(dynamicQuantLocal[FP32_ONE_BLOCK_NUM], tempLocal, tempLocal, colsTileLength);

    // Stage the smoothed fp32 tile for the second pass.
    DataCopyPad(quantSrcGm[j * this->perLoopCols], inLocal, intriParamsFp32);
    smoothInQueue.FreeTensor(smoothLocal);
    // Next tile's MTE2 load must wait for this MTE3 store (quantSrcGm reuse).
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);

    return dynamicQuantLocal.GetValue(FP32_ONE_BLOCK_NUM);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Second pass of the tiled quantization: reloads the staged fp32 tile j from
// quantSrcGm, divides by the row-wide scale scaleTemp, rounds to int8 through
// the int32/half cast chain, and writes the tile to expandedX at dstIndex.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::ComputeScale(LocalTensor<float> &inLocal,
                                                                               LocalTensor<float> &tempLocal,
                                                                               float scaleTemp, int64_t dstIndex,
                                                                               int64_t j)
{
    DataCopyExtParams copyInParams{1, static_cast<uint32_t>(colsTileLength * sizeof(float)), 0, 0, 0};
    DataCopyExtParams copyOutParams{1, static_cast<uint32_t>(colsTileLength * sizeof(int8_t)), 0, 0, 0};

    LocalTensor<int8_t> outLocal = inputXOutQueue.AllocTensor<int8_t>();

    DataCopyPad(inLocal, quantSrcGm[j * this->perLoopCols], copyInParams, {false, 0, 0, 0});
    inputXInQueue.EnQue<float>(inLocal);
    inLocal = inputXInQueue.DeQue<float>();

    // Broadcast the row scale and divide element-wise.
    Duplicate<float>(tempLocal, scaleTemp, colsTileLength);
    PipeBarrier<PIPE_V>();

    Div(tempLocal, inLocal, tempLocal, colsTileLength);
    PipeBarrier<PIPE_V>();

    // fp32 -> int32 -> half -> int8 rounding chain (unity dequant scale).
    Cast(tempLocal.ReinterpretCast<int32_t>(), tempLocal, RoundMode::CAST_RINT, colsTileLength);
    PipeBarrier<PIPE_V>();
    SetDeqScale((half)1.000000e+00f);
    Cast(tempLocal.ReinterpretCast<half>(), tempLocal.ReinterpretCast<int32_t>(), RoundMode::CAST_ROUND,
         colsTileLength);
    PipeBarrier<PIPE_V>();
    Cast(outLocal, tempLocal.ReinterpretCast<half>(), RoundMode::CAST_TRUNC, colsTileLength);

    inputXOutQueue.EnQue(outLocal);
    outLocal = inputXOutQueue.DeQue<int8_t>();
    DataCopyPad(expandedXGm[dstIndex * this->cols + j * this->perLoopCols], outLocal, copyOutParams);

    inputXOutQueue.FreeTensor(outLocal);
    // Next tile's load must wait for this store to drain.
    SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
}
|
||||
|
||||
template <typename T, typename TilingData>
|
||||
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::ComputeLoops(int32_t srcIdx, int32_t dstIdx,
|
||||
int32_t expertIdx)
|
||||
{
|
||||
LocalTensor<float> inLocal = inputXInQueue.AllocTensor<float>();
|
||||
LocalTensor<float> tempLocal = calcQueue.AllocTensor<float>();
|
||||
LocalTensor<float> quantScaleLocal = scaleOutQueue.AllocTensor<float>();
|
||||
DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
|
||||
|
||||
uint32_t tmp = 0xFF7FFFFF;
|
||||
float reduceMax = *((float *)&tmp);
|
||||
for (int64_t j = 0; j < this->colLoops; j++) {
|
||||
colsTileLength = this->perLoopCols;
|
||||
if (j == this->colLoops - 1) {
|
||||
colsTileLength = this->lastLoopCols;
|
||||
}
|
||||
float tileMax = ComputeMax(inLocal, tempLocal, quantScaleLocal, srcIdx / this->k, expertIdx, j);
|
||||
reduceMax = (reduceMax > tileMax) ? reduceMax : tileMax;
|
||||
}
|
||||
|
||||
float scaleTemp = reduceMax / 127.0f;
|
||||
Duplicate<float>(quantScaleLocal, scaleTemp, 8);
|
||||
scaleOutQueue.EnQue(quantScaleLocal);
|
||||
quantScaleLocal = scaleOutQueue.DeQue<float>();
|
||||
|
||||
DataCopyPad(dynamicQuantScaleGm[dstIdx], quantScaleLocal, quantScaleParams);
|
||||
|
||||
for (int64_t j = 0; j < this->colLoops; j++) {
|
||||
colsTileLength = this->perLoopCols;
|
||||
if (j == this->colLoops - 1) {
|
||||
colsTileLength = this->lastLoopCols;
|
||||
}
|
||||
ComputeScale(inLocal, tempLocal, scaleTemp, dstIdx, j);
|
||||
}
|
||||
|
||||
inputXInQueue.FreeTensor(inLocal);
|
||||
calcQueue.FreeTensor(tempLocal);
|
||||
scaleOutQueue.FreeTensor(quantScaleLocal);
|
||||
}
|
||||
|
||||
template <typename T, typename TilingData>
// Tiled output path (colLoops > 1): same drop/pad walk as CopyOut(), but
// padding and quantization are performed tile-by-tile across the row.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyOutLoops(int64_t progress)
{
    LocalTensor<int32_t> inLocal = copyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> outLocal = copyOutQueue.AllocTensor<int32_t>();
    int64_t length = Align(currentLoopRows, sizeof(int32_t));
    DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};

    // Scalar reads below must wait for the MTE2 copy of inLocal.
    SetWaitFlag<HardEvent::MTE2_S>(HardEvent::MTE2_S);
    if (this->lastExpertId == -1) {
        // First chunk on this core: resume from where earlier cores stopped.
        this->lastExpertId = this->lastCoreExpertId;
        this->tokenCount = this->lastCoreExpertIdNum;
    }
    for (int64_t idx = 0; idx < currentLoopRows; idx++) {
        int32_t expertIdx = inLocal[length].GetValue(idx);
        SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
        int32_t index = 0;
        // Zero-pad all remaining slots of experts that are now complete,
        // one column tile at a time (outTmpLocal holds only perLoopCols).
        while (this->lastExpertId < expertIdx) {
            while (this->tokenCount < this->expertCapacity) {
                index = this->lastExpertId * this->expertCapacity + this->tokenCount;
                int64_t col = this->perLoopCols;
                DataCopyPad(dynamicQuantScaleGm[index], this->scaleOutTmpLocal, quantScaleParams);
                for (int64_t i = 0; i < this->colLoops; i++) {
                    if (i == this->colLoops - 1) {
                        col = this->lastLoopCols;
                    }
                    DataCopyExtParams copyParams1{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(int8_t)),
                                                  0, 0, 0};
                    DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal,
                                copyParams1);
                }
                this->tokenCount++;
            }
            this->tokenCount = 0;
            this->lastExpertId++;
        }

        // Keep the token only while the expert still has capacity.
        if (this->tokenCount < this->expertCapacity) {
            int32_t outOffset = inLocal.GetValue(idx);  // original source row
            index = expertIdx * this->expertCapacity + this->tokenCount;
            outLocal.SetValue(0, index);
            // Scalar SetValue must complete before MTE3 reads outLocal.
            SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
            DataCopyPad(expandedRowIdxGm[outOffset], outLocal, copyParams);
            if (smoothType == SCALE_EH) {
                ComputeLoops(outOffset, index, expertIdx);
            } else {
                // No per-expert smooth row: expert 0's offset is a don't-care.
                ComputeLoops(outOffset, index, 0);
            }
            SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
            this->tokenCount++;
        }
    }
    copyInQueue.FreeTensor(inLocal);
    copyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
// Zero-pads every expert slot that remains unfilled after the last real
// token. Only the last active core performs the padding; all cores release
// the zero-fill tensors here (matching the DeQue ownership taken in Process).
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::CopyOutRemain()
{
    DataCopyExtParams quantScaleParams{1, static_cast<uint32_t>(sizeof(int32_t)), 0, 0, 0};
    if (this->blockIdx != this->srcToDstTilingData->needCoreNum - 1) {
        // Not the last active core: just release the zero-fill tensors.
        copyOutZeroQueue.FreeTensor(this->outTmpLocal);
        scaleOutZeroQueue.FreeTensor(this->scaleOutTmpLocal);
        return;
    }
    // Pad out the current expert's remaining slots and every expert after it.
    while (this->lastExpertId < this->expertNum) {
        while (this->tokenCount < this->expertCapacity) {
            int32_t index = this->lastExpertId * this->expertCapacity + this->tokenCount;
            int64_t col = this->perLoopCols;
            DataCopyPad(dynamicQuantScaleGm[index], this->scaleOutTmpLocal, quantScaleParams);
            for (int64_t i = 0; i < this->colLoops; i++) {
                if (i == this->colLoops - 1) {
                    col = this->lastLoopCols;
                }
                DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(col * sizeof(int8_t)), 0,
                                             0, 0};
                DataCopyPad(expandedXGm[index * this->cols + i * this->perLoopCols], this->outTmpLocal, copyParams);
                SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
            }
            this->tokenCount++;
        }
        this->tokenCount = 0;
        this->lastExpertId++;
    }
    copyOutZeroQueue.FreeTensor(this->outTmpLocal);
    scaleOutZeroQueue.FreeTensor(this->scaleOutTmpLocal);
}
|
||||
|
||||
template <typename T, typename TilingData>
|
||||
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::Init(GM_ADDR x, GM_ADDR scale, GM_ADDR expandedRowIdx,
|
||||
GM_ADDR expandedX, GM_ADDR dynamicQuantScale,
|
||||
GM_ADDR workspace, const TilingData *tilingData,
|
||||
TPipe *tPipe)
|
||||
{
|
||||
int64_t blockNum = GetBlockNum();
|
||||
this->pipe = tPipe;
|
||||
this->blockIdx = GetBlockIdx();
|
||||
|
||||
this->coreNum = tilingData->coreNum;
|
||||
this->totalLength = tilingData->n * tilingData->k;
|
||||
this->srcToDstTilingData = &(tilingData->srcToDstDropPadDynamicParamsOp);
|
||||
this->expertNum = tilingData->expertNum;
|
||||
this->expertCapacity = tilingData->expertCapacity;
|
||||
this->cols = tilingData->cols;
|
||||
this->k = tilingData->k;
|
||||
this->smoothType = tilingData->smoothType;
|
||||
|
||||
if (this->blockIdx == this->srcToDstTilingData->needCoreNum - 1) {
|
||||
this->coreRows = this->srcToDstTilingData->lastCoreRows;
|
||||
this->perLoopRows = this->srcToDstTilingData->lastCorePerLoopRows;
|
||||
this->lastLoopRows = this->srcToDstTilingData->lastCoreLastLoopRows;
|
||||
this->rowLoops = this->srcToDstTilingData->lastCoreLoops;
|
||||
} else {
|
||||
this->coreRows = this->srcToDstTilingData->perCoreRows;
|
||||
this->perLoopRows = this->srcToDstTilingData->perCorePerLoopRows;
|
||||
this->lastLoopRows = this->srcToDstTilingData->perCoreLastLoopRows;
|
||||
this->rowLoops = this->srcToDstTilingData->perCoreLoops;
|
||||
}
|
||||
this->perLoopCols = this->srcToDstTilingData->perLoopCols;
|
||||
this->lastLoopCols = this->srcToDstTilingData->lastLoopCols;
|
||||
this->colLoops = this->srcToDstTilingData->colLoops;
|
||||
this->perLoopColsAlign = Align(this->perLoopCols, sizeof(T));
|
||||
|
||||
inputXGm.SetGlobalBuffer((__gm__ T *)x);
|
||||
quantSmoothGm.SetGlobalBuffer((__gm__ float *)scale);
|
||||
dynamicQuantScaleGm.SetGlobalBuffer((__gm__ float *)dynamicQuantScale);
|
||||
|
||||
int64_t length = Align(this->totalLength, sizeof(int32_t));
|
||||
expandedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expandedRowIdx, length);
|
||||
expandedXGm.SetGlobalBuffer((__gm__ int8_t *)expandedX, this->expertNum * this->expertCapacity * this->cols);
|
||||
|
||||
expandedExpertIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
|
||||
this->blockIdx * this->srcToDstTilingData->perCoreRows,
|
||||
Align(this->coreRows, sizeof(int32_t)));
|
||||
expandDstToSrcRowGm.SetGlobalBuffer((__gm__ int32_t *)workspace + length +
|
||||
this->blockIdx * this->srcToDstTilingData->perCoreRows,
|
||||
Align(this->coreRows, sizeof(int32_t)));
|
||||
expertIdxValueGm.SetGlobalBuffer(
|
||||
(__gm__ int32_t *)workspace + length * 2 + Align(this->expertNum, sizeof(int32_t)) * 2, this->coreNum * 2);
|
||||
if (this->colLoops > 1) {
|
||||
quantSrcGm.SetGlobalBuffer((__gm__ float *)workspace + length * 2 +
|
||||
Align(this->expertNum, sizeof(int32_t)) * 2 + this->coreNum * 2 +
|
||||
this->blockIdx * this->cols,
|
||||
this->cols * sizeof(float));
|
||||
}
|
||||
|
||||
pipe->InitBuffer(copyInQueue, 1, AlignBytes(this->perLoopRows, sizeof(int32_t)) * 2);
|
||||
pipe->InitBuffer(copyOutQueue, 1, AlignBytes(INT32_ONE_BLOCK_NUM, sizeof(int32_t)));
|
||||
pipe->InitBuffer(copyOutZeroQueue, 1, AlignBytes(this->perLoopCols, sizeof(int16_t)));
|
||||
|
||||
int64_t perLoopColsAlignBytes = AlignBytes(this->perLoopCols, sizeof(T));
|
||||
perLoopColsAlignBytes =
|
||||
Max(int64_t(perLoopColsAlignBytes * sizeof(float) / sizeof(T)), int64_t(BLOCK_BYTES + BLOCK_BYTES));
|
||||
|
||||
pipe->InitBuffer(inputXInQueue, 1, perLoopColsAlignBytes);
|
||||
pipe->InitBuffer(smoothInQueue, 1, AlignBytes(this->perLoopCols, sizeof(float)));
|
||||
pipe->InitBuffer(calcQueue, 1, AlignBytes(this->perLoopCols, sizeof(float)));
|
||||
pipe->InitBuffer(inputXOutQueue, 1, AlignBytes(this->perLoopCols, sizeof(int8_t)));
|
||||
pipe->InitBuffer(scaleOutQueue, 1, BLOCK_BYTES + BLOCK_BYTES);
|
||||
pipe->InitBuffer(scaleOutZeroQueue, 1, BLOCK_BYTES);
|
||||
}
|
||||
|
||||
template <typename T, typename TilingData>
// Main entry for the gather+quantize stage. Chooses the single-pass path
// (CopyOut) when a whole row fits in UB, or the two-pass tiled path
// (CopyOutLoops) otherwise, then zero-pads remaining expert slots.
// NOTE(review): unlike MoeCustomSrcToDstWithCapacity::Process, there is no
// trailing SyncAll() here — confirm a later stage provides the cross-core
// barrier.
__aicore__ inline void MoeCustomSrcToDstAndGather<T, TilingData>::Process()
{
    if (this->blockIdx < this->srcToDstTilingData->needCoreNum) {
        AssistInit();
        // Take ownership of the zero-fill tensors; released in CopyOutRemain().
        this->outTmpLocal = copyOutZeroQueue.DeQue<int8_t>();
        this->scaleOutTmpLocal = scaleOutZeroQueue.DeQue<float>();
        currentLoopRows = perLoopRows;
        if (colLoops > 1) {
            // Row does not fit in UB: tiled two-pass quantization.
            for (int64_t loop = 0; loop < this->rowLoops; loop++) {
                if (loop == this->rowLoops - 1) {
                    currentLoopRows = lastLoopRows;
                }
                CopyIn(loop);
                CopyOutLoops(loop);
            }
        } else {
            smoothLocal = smoothInQueue.AllocTensor<float>();
            if (smoothType == SCALE_1H) {
                // Single shared smooth row: load once and reuse for every token.
                DataCopyExtParams smoothParams{1, static_cast<uint32_t>(this->cols * sizeof(float)), 0, 0, 0};
                DataCopyPad(smoothLocal, quantSmoothGm, smoothParams, {false, 0, 0, 0});
            }
            for (int64_t loop = 0; loop < this->rowLoops; loop++) {
                if (loop == this->rowLoops - 1) {
                    currentLoopRows = lastLoopRows;
                }
                CopyIn(loop);
                CopyOut(loop);
            }
            smoothInQueue.FreeTensor(smoothLocal);
        }
        // Zero-pad the tail expert slots (last active core only does real work).
        CopyOutRemain();
    }
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_ROW_IDX_GATHER_DROPPAD_DYNAMIC_H
|
||||
// ===== begin file: moe_custom_sort_actual_expert.h =====
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_actual_expert.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_SORT_ACTUAL_EXPERT_H
|
||||
#define MOE_CUSTOM_SORT_ACTUAL_EXPERT_H
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
constexpr int64_t MULTI_GATHERED_SORT_CORE_NUM = 16;
|
||||
constexpr int64_t MULTI_GATHERED_SORT_THRSHOLD = 5632;
|
||||
constexpr int64_t SINGLE_GATHERED_BUFFER_NUM = 2;
|
||||
constexpr int64_t SINGLE_GATHERED_MAX_NUM = 21845;
|
||||
|
||||
template <typename T>
// Kernel stage that sorts tokens by expert id restricted to the active
// expert range [expertStart_, expertEnd_), choosing between a single-core
// gathered sort and a multi-core gathered sort depending on problem size
// (see MULTI_GATHERED_SORT_THRSHOLD).
class MoeSortActualExpert {
public:
    __aicore__ inline MoeSortActualExpert(){};
    // Binds GM buffers and unpacks tiling; must be called before Process().
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX, GM_ADDR expendedRowIdx,
                                GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    // Returns a bool — presumably whether the multi-core path is needed;
    // TODO confirm against the (out-of-view) definition.
    __aicore__ inline bool Process();
    __aicore__ inline void multiCoreGatheredSort();
    __aicore__ inline void CopyOutExpandRowIdx();

private:
    __aicore__ inline void CopyIn();
    __aicore__ inline void SortCompute();
    // In-kernel per-core work split (needCoreNum_/perCoreElements_ etc.).
    __aicore__ inline void TilingInKernel();
    __aicore__ inline void ExpertCountCompute();
    __aicore__ inline void CopyOut();
    __aicore__ inline void CopyOutExpertCount();

private:
    TPipe *pipe;
    TBuf<TPosition::VECCALC> buffer_;  // shared UB arena, sliced via the *Offset_ fields
    TQueBind<TPosition::VECIN, TPosition::VECOUT, SINGLE_GATHERED_BUFFER_NUM> scaleCopyInQueue_;
    TQue<TPosition::VECOUT, 1> sortedNumCopyOutQueue_;

    GlobalTensor<T> xGm_;
    GlobalTensor<float> scaleGm_;
    GlobalTensor<T> expandedXGm_;
    GlobalTensor<int64_t> expertTokensCountOrCumsumGm_;
    GlobalTensor<float> expandedScaleGm_;
    GlobalTensor<int32_t> expendedRowIdxGm_;
    GlobalTensor<int32_t> expertIdxGm_;
    GlobalTensor<int32_t> workspaceGm_;
    GlobalTensor<float> workspaceExpertIdxGm_;
    GlobalTensor<int32_t> workspaceGatheredSortNumGm_;
    GlobalTensor<float> workspaceGatheredExpertIdxGm_;
    GlobalTensor<int32_t> workspaceGatheredExpertIndexGm_;

    // Byte offsets of the regions carved out of buffer_.
    int64_t expertIdxOffset_ = 0;
    int64_t expertIndexOffset_ = 0;
    int64_t compareScalarMaskOffset_ = 0;
    int64_t compareScalarMask0Offset_ = 0;
    int64_t compareScalarMask1Offset_ = 0;
    int64_t gatherMaskOffset_ = 0;

    int64_t totalLength_;        // n * k (total expanded rows)
    int64_t expertStart_ = 0;    // first expert id handled (inclusive)
    int64_t expertEnd_ = 0;      // last expert id handled (exclusive; see GT compare)
    int64_t actual_expert_num_ = 0;
    int64_t cols_ = 0;
    int64_t rowIdxType_ = 0;
    int64_t isInputScale_ = 0;
    int64_t k_ = 0;

    int64_t needSortNum_ = 0;    // tokens that survive the expert-range filter

    // Multi-core split (filled by TilingInKernel).
    int64_t needCoreNum_ = 0;
    int64_t perCoreElements_ = 0;
    int64_t lastCoreElements_ = 0;
    int64_t curCoreElements_ = 0;
    int64_t curCoreStartIndex_ = 0;

    bool needMultiSort = false;  // whether the multi-core sort path is used

    int64_t kvFactor = 2;        // key+value pair width used in sort buffers

    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
    static constexpr int64_t MASK_STRIDE = 64;
};
|
||||
|
||||
template <typename T>
// Loads the entire expert-id array (totalLength_ int32 values) from GM into
// the expertIdx slice of the shared UB arena.
__aicore__ inline void MoeSortActualExpert<T>::CopyIn()
{
    // Slice of buffer_ at expertIdxOffset_ (byte offset -> element index).
    LocalTensor<int32_t> expertIdx = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(this->totalLength_ * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(expertIdx, expertIdxGm_, dataCopyParams, dataCopyPadParams);
    // Vector ops that follow must wait for the MTE2 transfer.
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
}
|
||||
|
||||
// Filters the expert ids to the [expertStart_, expertEnd_) range owned by this
// rank, gathers the surviving (id, original-row-index) pairs, and sorts them by
// expert id. Ids are negated as fp32 so the hardware Sort's ordering yields
// ascending expert ids (assumes Sort is descending — confirm with AscendC docs).
// Side effects: sets actual_expert_num_, needSortNum_, and (for large counts)
// needMultiSort plus a workspace spill consumed by multiCoreGatheredSort().
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::SortCompute()
{
    LocalTensor<int32_t> expertIdx = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    LocalTensor<int32_t> gatheredExpertIdx = buffer_.Get<int32_t>();
    LocalTensor<float> gatheredExpertIdxFp32 = gatheredExpertIdx.ReinterpretCast<float>();

    // In-place int32 -> fp32 conversion, then negate so sorting keys invert order.
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, this->totalLength_);
    PipeBarrier<PIPE_V>();
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, this->totalLength_);
    PipeBarrier<PIPE_V>();

    LocalTensor<uint8_t> compareScalarMaskLocalTensor0 = buffer_.Get<uint8_t>()[compareScalarMask0Offset_];
    LocalTensor<uint8_t> compareScalarMaskLocalTensor1 = buffer_.Get<uint8_t>()[compareScalarMask1Offset_];
    LocalTensor<uint8_t> gatherMaskLocalTensor = buffer_.Get<uint8_t>()[gatherMaskOffset_];

    // mask0: -id <= -expertStart_  (i.e. id >= expertStart_).
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor0, expertIdxFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::LE,
        (this->totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();

    // mask1: -id > -expertEnd_  (i.e. id < expertEnd_).
    AscendC::CompareScalar(
        compareScalarMaskLocalTensor1, expertIdxFp32, static_cast<float>(-expertEnd_), AscendC::CMPMODE::GT,
        (this->totalLength_ + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
    PipeBarrier<PIPE_V>();
    // Combine both bounds into the gather bitmask.
    And(gatherMaskLocalTensor.ReinterpretCast<uint16_t>(), compareScalarMaskLocalTensor0.ReinterpretCast<uint16_t>(),
        compareScalarMaskLocalTensor1.ReinterpretCast<uint16_t>(),
        Ceil(this->totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE / kvFactor);
    PipeBarrier<PIPE_V>();

    // Compact the in-range (negated) ids; rsvdCnt receives the surviving count.
    uint64_t rsvdCnt = 0;
    GatherMaskParams gatherMaskParams;
    gatherMaskParams.repeatTimes = 1;
    gatherMaskParams.src0BlockStride = 1;
    gatherMaskParams.src0RepeatStride = 8;
    gatherMaskParams.src1RepeatStride = 8;
    GatherMask(gatheredExpertIdxFp32, expertIdxFp32, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(this->totalLength_), gatherMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>();
    actual_expert_num_ = rsvdCnt;
    // Handle actual_expert_num_ == 0: nothing to sort on this rank.
    if (actual_expert_num_ < 1) {
        return;
    }
    // Round up to the hardware sort granularity.
    int64_t needSortNum = Ceil(static_cast<int64_t>(rsvdCnt), ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    needSortNum_ = needSortNum;

    // Build 0..totalLength_-1 row indices and compact them with the same mask,
    // pairing each surviving id with its original row.
    LocalTensor<int32_t> expertIndex = buffer_.Get<int32_t>()[expertIdxOffset_ / sizeof(int32_t)];
    LocalTensor<int32_t> gatheredExpertIndex = buffer_.Get<int32_t>()[needSortNum];
    ArithProgression<int32_t>(expertIndex, 0, 1, this->totalLength_);
    GatherMask(gatheredExpertIndex, expertIndex, gatherMaskLocalTensor.ReinterpretCast<uint32_t>(), true,
               static_cast<uint32_t>(this->totalLength_), gatherMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>();
    if (rsvdCnt > MULTI_GATHERED_SORT_THRSHOLD) {
        // Too many pairs for a single-core sort: block 0 spills the gathered
        // data to workspace and Process() reruns via multiCoreGatheredSort().
        if (GetBlockIdx() == 0) {
            SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
            DataCopyExtParams copyParams{1, static_cast<uint32_t>(rsvdCnt * sizeof(int32_t)), 0, 0, 0};
            DataCopyPad(workspaceGatheredExpertIdxGm_, gatheredExpertIdxFp32, copyParams);
            DataCopyPad(workspaceGatheredExpertIndexGm_, gatheredExpertIndex, copyParams);
        }
        needMultiSort = true;
        return;
    }
    // Pad the tail of the last sort repeat with MIN_FP32 so padding sinks to the end.
    int64_t duplicateNum = rsvdCnt % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = rsvdCnt - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(gatheredExpertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    PipeBarrier<PIPE_V>();
    // Key/value sort: keys are negated ids, values are original row indices.
    LocalTensor<float> concatLocal;
    LocalTensor<float> sortTempTensor = buffer_.Get<float>()[needSortNum * kvFactor];
    Concat(concatLocal, gatheredExpertIdxFp32, sortTempTensor, needSortNum / ONE_REPEAT_SORT_NUM);
    LocalTensor<float> sortedLocal = buffer_.Get<float>()[needSortNum * kvFactor + needSortNum * kvFactor * kvFactor];
    Sort<float, true>(sortedLocal, concatLocal, gatheredExpertIndex.ReinterpretCast<uint32_t>(), sortTempTensor,
                      needSortNum / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();
    LocalTensor<float> sortedExpertIdx = gatheredExpertIdxFp32;
    LocalTensor<int32_t> sortedExpertIndex = gatheredExpertIndex.ReinterpretCast<int32_t>();

    // Split the interleaved (key, value) result back into separate tensors.
    Extract(sortedExpertIdx, sortedExpertIndex.ReinterpretCast<uint32_t>(), sortedLocal,
            needSortNum / ONE_REPEAT_SORT_NUM);
    PipeBarrier<PIPE_V>();

    LocalTensor<int32_t> sortedExpertIdxInt32 = sortedExpertIdx.ReinterpretCast<int32_t>();

    // Undo the negation and convert back to int32 expert ids.
    Muls(sortedExpertIdx, sortedExpertIdx, (float)-1, rsvdCnt);
    Cast(sortedExpertIdxInt32, sortedExpertIdx, RoundMode::CAST_ROUND, rsvdCnt);
}
// Splits actual_expert_num_ sorted pairs across cores. In the multi-sort path
// only MULTI_GATHERED_SORT_CORE_NUM cores participate; otherwise all blocks do.
// NOTE(review): if actual_expert_num_ is 0 (SortCompute returned early),
// perCoreElements_ becomes 0 and Ceil(actual_expert_num_, perCoreElements_)
// divides by zero — confirm callers guarantee at least one in-range expert,
// or that Ceil() guards a zero divisor.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::TilingInKernel()
{
    int64_t coreNum = needMultiSort ? MULTI_GATHERED_SORT_CORE_NUM : GetBlockNum();
    perCoreElements_ = Ceil(actual_expert_num_, coreNum);
    needCoreNum_ = Ceil(actual_expert_num_, perCoreElements_);
    // The last active core takes the (possibly smaller) remainder.
    lastCoreElements_ = actual_expert_num_ - (needCoreNum_ - 1) * perCoreElements_;
    if (GetBlockIdx() == needCoreNum_ - 1) {
        curCoreElements_ = lastCoreElements_;
    } else {
        curCoreElements_ = perCoreElements_;
    }
    curCoreStartIndex_ = GetBlockIdx() * perCoreElements_;
}
// Multi-core fallback for large gathered counts: each participating core loads
// its slice of the spilled (id, index) pairs from workspace, pads and sorts it
// locally, then writes the sorted (key, value) stream plus its element count
// back to workspace for a later merge-sort stage.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::multiCoreGatheredSort()
{
    needSortNum_ = Ceil(static_cast<int64_t>(curCoreElements_), ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    // Re-tile over the fixed multi-sort core count (used for the output offset below).
    perCoreElements_ = Ceil(this->totalLength_, MULTI_GATHERED_SORT_CORE_NUM);

    LocalTensor<int32_t> sortedNumOutLocal = sortedNumCopyOutQueue_.AllocTensor<int32_t>();
    LocalTensor<float> gatheredExpertIdxFp32 = buffer_.Get<float>();
    LocalTensor<int32_t> gatheredExpertIndex = buffer_.Get<int32_t>()[needSortNum_];
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(curCoreElements_ * sizeof(float)),
                                     0, 0, 0};
    DataCopyPadExtParams<float> expertIdxPadParams{false, 0, 0, 0};
    DataCopyPad(gatheredExpertIdxFp32, workspaceGatheredExpertIdxGm_[curCoreStartIndex_], dataCopyParams,
                expertIdxPadParams);
    DataCopyPadExtParams<int32_t> expertIndexPadParams{false, 0, 0, 0};
    DataCopyPad(gatheredExpertIndex, workspaceGatheredExpertIndexGm_[curCoreStartIndex_], dataCopyParams,
                expertIndexPadParams);
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);

    LocalTensor<float> concatLocal;
    LocalTensor<float> sortTempTensor = buffer_.Get<float>()[needSortNum_ * kvFactor];
    // Pad the tail of the last sort repeat with MIN_FP32 so padding sinks to the end.
    int64_t duplicateNum = curCoreElements_ % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = curCoreElements_ - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(gatheredExpertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }
    Concat(concatLocal, gatheredExpertIdxFp32, sortTempTensor, needSortNum_ / ONE_REPEAT_SORT_NUM);
    LocalTensor<float> sortedLocal = buffer_.Get<float>()[needSortNum_ * kvFactor + needSortNum_ * kvFactor * kvFactor];
    Sort<float, true>(sortedLocal, concatLocal, gatheredExpertIndex.ReinterpretCast<uint32_t>(), sortTempTensor,
                      needSortNum_ / ONE_REPEAT_SORT_NUM);

    // Copy out the interleaved sorted (key, value) stream for the merge stage.
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    int64_t curCoreSortedStartIndex = kvFactor * GetBlockIdx() * perCoreElements_;
    dataCopyParams.blockLen = static_cast<uint32_t>(kvFactor * curCoreElements_ * sizeof(float));
    DataCopyPad(workspaceExpertIdxGm_[curCoreSortedStartIndex], sortedLocal, dataCopyParams);
    // Copy out this core's sorted element count.
    sortedNumOutLocal.SetValue(0, curCoreElements_);
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    dataCopyParams.blockLen = static_cast<uint32_t>(sizeof(int32_t));
    DataCopyPad(workspaceGatheredSortNumGm_[GetBlockIdx()], sortedNumOutLocal, dataCopyParams);
    sortedNumCopyOutQueue_.FreeTensor(sortedNumOutLocal);
}
// Writes the sorted original-row indices to the expendedRowIdx output.
// Only block 0 copies out, although all active cores hold the same sorted
// result locally after the single-core sort path.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::CopyOutExpandRowIdx()
{
    LocalTensor<int32_t> sortedExpertIndex = buffer_.Get<int32_t>()[needSortNum_];
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    if (GetBlockIdx() == 0) {
        DataCopyExtParams copyParams{1, static_cast<uint32_t>(actual_expert_num_ * sizeof(int32_t)), 0, 0, 0};
        DataCopyPad(expendedRowIdxGm_, sortedExpertIndex, copyParams);
    }
}
// Builds a per-core histogram of token counts per local expert (index
// expertIdx - expertStart_) using scalar GetValue/SetValue, then accumulates it
// into the shared workspace histogram with an atomic add so all cores'
// contributions sum correctly. CopyOutExpertCount() later reads the total.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::ExpertCountCompute()
{
    // This core's slice of the sorted expert ids lives at curCoreStartIndex_.
    LocalTensor<int32_t> sortedExpertIdx = buffer_.Get<int32_t>()[curCoreStartIndex_];
    LocalTensor<int32_t> expertCountLocalTensor = buffer_.Get<int32_t>()[needSortNum_ * kvFactor];
    Duplicate(expertCountLocalTensor, 0, expertEnd_ - expertStart_);

    // Scalar histogram loop over this core's elements.
    for (int64_t i = 0; i < curCoreElements_; i++) {
        int64_t expertIdx = sortedExpertIdx.GetValue(i) - expertStart_;
        int32_t curExpertCount = expertCountLocalTensor.GetValue(expertIdx);
        expertCountLocalTensor.SetValue(expertIdx, curExpertCount + 1);
    }
    SetWaitFlag<HardEvent::S_MTE3>(HardEvent::S_MTE3);
    DataCopyExtParams copyOutParams1{1, static_cast<uint32_t>((expertEnd_ - expertStart_) * sizeof(int32_t)), 0, 0, 0};
    // Atomic add merges the per-core partial histograms in workspace.
    SetAtomicAdd<int32_t>();
    DataCopyPad(workspaceGm_, expertCountLocalTensor, copyOutParams1);
    SetAtomicNone();
}
// Gathers token rows into their sorted positions: for each sorted pair this
// core owns, copies row (sortedIndex / k_) of x into row (curCoreStartIndex_ + i)
// of expandedX, and the matching per-row scale when isInputScale_ is set.
// Rows are moved one at a time through the unified buffer with explicit
// MTE2->MTE3 event ordering.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::CopyOut()
{
    LocalTensor<int32_t> sortedExpertIndex = buffer_.Get<int32_t>()[needSortNum_ + curCoreStartIndex_];
    // Staging area for one row of x, placed after the sort and histogram regions.
    int64_t xLocalOffset = (needSortNum_ * kvFactor + ASSIST_NUM) * sizeof(int32_t) / sizeof(T);
    LocalTensor<T> xLocalTensor = buffer_.Get<T>()[xLocalOffset];

    for (int64_t i = 0; i < curCoreElements_; i++) {
        // sortedExpertIndex holds flattened (row * k + slot) positions; /k_ recovers the source row.
        int64_t srcRow = sortedExpertIndex.GetValue(i) / k_;
        int64_t dstRow = i + curCoreStartIndex_;
        // Wait for the scalar index read before reusing the staging buffer for MTE2.
        SetWaitFlag<HardEvent::S_MTE2>(HardEvent::S_MTE2);

        LocalTensor<float> scaleLocalTensor;
        DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPadExtParams<T> dataCopyPadParams{false, 0, 0, 0};
        DataCopyPad(xLocalTensor, xGm_[srcRow * cols_], dataCopyParams, dataCopyPadParams);
        if (isInputScale_ == 1) {
            // One float scale per source row, staged through its own queue.
            scaleLocalTensor = scaleCopyInQueue_.AllocTensor<float>();
            DataCopyExtParams dataCopyParams2{static_cast<uint16_t>(1), static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
            DataCopyPadExtParams<float> dataCopyPadParams2{false, 0, 0, 0};
            DataCopyPad(scaleLocalTensor, scaleGm_[srcRow], dataCopyParams2, dataCopyPadParams2);
            scaleCopyInQueue_.EnQue<float>(scaleLocalTensor);
        }
        // Ensure the inbound copy finished before copying the row back out.
        SetWaitFlag<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
        DataCopyExtParams copyOutParams1{1, static_cast<uint32_t>(cols_ * sizeof(T)), 0, 0, 0};
        DataCopyPad(expandedXGm_[dstRow * cols_], xLocalTensor, copyOutParams1);
        if (isInputScale_ == 1) {
            scaleLocalTensor = scaleCopyInQueue_.DeQue<float>();
            DataCopyExtParams copyOutParams2{1, static_cast<uint32_t>(sizeof(float)), 0, 0, 0};
            DataCopyPad(expandedScaleGm_[dstRow], scaleLocalTensor, copyOutParams2);
            scaleCopyInQueue_.FreeTensor(scaleLocalTensor);
        }
    }
}
// Reads the accumulated int32 per-expert token counts from workspace, widens
// them to int64, and writes them to the expertTokensCountOrCumsum output.
// Run by a single core (the last block, per Process()) after SyncAll so every
// core's atomic-add contribution is visible.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::CopyOutExpertCount()
{
    LocalTensor<int32_t> expertCountLocalTensor = buffer_.Get<int32_t>()[needSortNum_ * kvFactor];
    LocalTensor<int64_t> expertCountLocalTensorInt64 =
        buffer_.Get<int32_t>()[needSortNum_ * kvFactor + ASSIST_NUM].ReinterpretCast<int64_t>();
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>((expertEnd_ - expertStart_) * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(expertCountLocalTensor, workspaceGm_, dataCopyParams, dataCopyPadParams);
    SetWaitFlag<HardEvent::MTE2_V>(HardEvent::MTE2_V);
    // int32 -> int64 widening cast (exact; no rounding involved).
    Cast(expertCountLocalTensorInt64, expertCountLocalTensor, RoundMode::CAST_NONE, (expertEnd_ - expertStart_));
    SetWaitFlag<HardEvent::V_MTE3>(HardEvent::V_MTE3);
    DataCopyExtParams copyOutParams1{1, static_cast<uint32_t>((expertEnd_ - expertStart_) * sizeof(int64_t)), 0, 0, 0};
    DataCopyPad(expertTokensCountOrCumsumGm_, expertCountLocalTensorInt64, copyOutParams1);
}
// Binds all global-memory buffers, lays out the workspace (histogram region,
// merge-sort spill, gathered id/index spill, per-core sort counts) and the
// unified local buffer offsets, and initializes the queues. Block 0 zeroes the
// shared histogram region before any core accumulates into it.
template <typename T>
__aicore__ inline void MoeSortActualExpert<T>::Init(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR expandedX,
                                                    GM_ADDR expendedRowIdx, GM_ADDR expertTokensCountOrCumsum,
                                                    GM_ADDR expandedScale, GM_ADDR workspace,
                                                    const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->pipe = tPipe;
    this->totalLength_ = tilingData->n * tilingData->k;
    cols_ = tilingData->cols;
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;
    isInputScale_ = tilingData->isInputScale;
    k_ = tilingData->k;

    expertIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expertIdx);

    expendedRowIdxGm_.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx);

    xGm_.SetGlobalBuffer((__gm__ T *)x);
    scaleGm_.SetGlobalBuffer((__gm__ float *)scale);
    expandedXGm_.SetGlobalBuffer((__gm__ T *)expandedX);
    expertTokensCountOrCumsumGm_.SetGlobalBuffer((__gm__ int64_t *)expertTokensCountOrCumsum);
    expandedScaleGm_.SetGlobalBuffer((__gm__ float *)expandedScale);
    workspaceGm_.SetGlobalBuffer((__gm__ int32_t *)workspace, ASSIST_NUM);
    if (GetBlockIdx() == 0) {
        // Zero the shared histogram before ExpertCountCompute's atomic adds.
        // NOTE(review): only block 0 waits on MTE3_MTE2 here — other cores'
        // visibility of the zeroed region presumably relies on a later SyncAll;
        // confirm against the kernel entry sequence.
        InitGlobalMemory(workspaceGm_, ASSIST_NUM, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    // Workspace layout (element offsets): [merge-sort k/v spill | gathered ids |
    // gathered indices | per-core sorted counts]. The first region aliases the
    // histogram region above.
    workspaceExpertIdxGm_.SetGlobalBuffer((__gm__ float *)workspace);
    int64_t offset = kvFactor * Align(this->totalLength_, sizeof(int32_t));
    workspaceGatheredExpertIdxGm_.SetGlobalBuffer((__gm__ float *)workspace + offset);
    offset += Align(this->totalLength_, sizeof(float));
    workspaceGatheredExpertIndexGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + offset);
    offset += Align(this->totalLength_, sizeof(float));
    workspaceGatheredSortNumGm_.SetGlobalBuffer((__gm__ int32_t *)workspace + offset);

    // Unified-buffer byte offsets for the raw ids and the compare/gather masks.
    expertIdxOffset_ = AlignBytes(this->totalLength_, sizeof(int32_t));
    expertIndexOffset_ = expertIdxOffset_;

    gatherMaskOffset_ = expertIdxOffset_ * kvFactor;
    int64_t maskOffset =
        AlignBytes(Ceil(this->totalLength_, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE, sizeof(int8_t));
    compareScalarMask0Offset_ = gatherMaskOffset_ + maskOffset;
    compareScalarMask1Offset_ = compareScalarMask0Offset_ + maskOffset;
    // Buffer is sized for the single-core worst case, independent of totalLength_.
    int64_t maskOffsetMax = Ceil(SINGLE_GATHERED_MAX_NUM, MASK_STRIDE) * MASK_STRIDE / DST_REP_STRIDE;
    int64_t bufferSize =
        AlignBytes(SINGLE_GATHERED_MAX_NUM, sizeof(int32_t)) * kvFactor + maskOffsetMax + maskOffsetMax + maskOffsetMax;
    pipe->InitBuffer(scaleCopyInQueue_, SINGLE_GATHERED_BUFFER_NUM, 32);
    pipe->InitBuffer(sortedNumCopyOutQueue_, SINGLE_GATHERED_BUFFER_NUM, 32);
    pipe->InitBuffer(buffer_, bufferSize); // 182992 Bytes
}
// Kernel driver for the "actual expert" path. Returns true when the routing
// completed on the single-core sort path; returns false when the gathered
// count exceeded the single-core threshold — in that case the data has been
// sorted per-core into workspace and the caller presumably continues with a
// merge-sort stage (confirm at the kernel entry point).
template <typename T>
__aicore__ inline bool MoeSortActualExpert<T>::Process()
{
    CopyIn();
    SortCompute();
    TilingInKernel();
    if (needMultiSort) {
        // Large-count fallback: every active core sorts its workspace slice.
        SyncAll();
        if (GetBlockIdx() < needCoreNum_) {
            multiCoreGatheredSort();
        }
        SyncAll();
        return false;
    }

    if (GetBlockIdx() < needCoreNum_) {
        CopyOutExpandRowIdx();
    }
    if (GetBlockIdx() < needCoreNum_) {
        ExpertCountCompute();
        CopyOut();
    }
    // Wait for all histogram contributions before the final count copy-out.
    SyncAll();
    if (GetBlockIdx() == GetBlockNum() - 1) {
        CopyOutExpertCount();
    }
    return true;
}
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_SORT_ACTUAL_EXPERT_H
|
||||
@@ -0,0 +1,71 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_base.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_SORT_BASE_H
|
||||
#define MOE_CUSTOM_SORT_BASE_H
|
||||
|
||||
#include "kernel_operator.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Common state and helpers shared by the MoE routing sort kernels
// (single-core and multi-core variants derive from this).
class MoeSortBase {
public:
    __aicore__ inline MoeSortBase(){};
    __aicore__ inline int64_t GetSyncRound();

protected:
    __aicore__ inline void CleanWSCache();
    // Full-device barrier; thin wrapper over AscendC::SyncAll().
    __aicore__ inline void SyncAll();

protected:
    TPipe *pipe;
    // Double-ended staging queues for sort input/output tiles.
    TQue<QuePosition::VECIN, 1> sortDataCopyInQueue;
    TQue<QuePosition::VECOUT, 1> sortDataCopyOutQueue;
    // Scratch buffers for sort intermediates.
    TBuf<TPosition::VECCALC> tempBuffer;
    TBuf<TPosition::VECCALC> sortedBuffer;

    GlobalTensor<int32_t> expertIdxGm;
    GlobalTensor<int32_t> expendedRowIdxGm;
    GlobalTensor<int32_t> sortedExpertForSourceRowGm;
    GlobalTensor<int32_t> expandDstToSrcRowGm;
    GlobalTensor<int32_t> sortedexpertIdxGm;
    GlobalTensor<int32_t> expertCountTempGm;

    int64_t tileLength;
    int64_t bufferNum = 1;
    int64_t totalLength;   // n * k routed pairs in total
    int64_t coreNum;

    // Expert id range [expertStart_, expertEnd_) owned by this rank.
    int64_t expertStart_ = 0;
    int64_t expertEnd_ = 0;
    int64_t n;
    int64_t k;
    int64_t ep_ = 0;       // expert-parallelism switch (non-zero enables range masking)
    int64_t oneLoopMaxElements_;
    int64_t rowIdxType_ = 0;

    static constexpr int64_t SYNC_GM_NUM = 2;
    static constexpr int64_t WORK_GM_NUM = 2;    // ping-pong workspace count
    static constexpr int64_t DST_BLK_STRIDE = 1;
    static constexpr int64_t DST_REP_STRIDE = 8;
};
// Full-device barrier: blocks until every core reaches this point.
__aicore__ inline void MoeSortBase::SyncAll()
{
    AscendC::SyncAll();
}
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_SORT_BASE_H
|
||||
@@ -0,0 +1,377 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_multi_core.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_VBS_ONE_CORE_H
|
||||
#define MOE_CUSTOM_VBS_ONE_CORE_H
|
||||
|
||||
#include "moe_custom_sort_base.h"
|
||||
#include "moe_custom_mrgsort.h"
|
||||
#include "moe_custom_mrgsort_out.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Multi-core MoE routing sort: per-core UB-tile sorts (VBS), then iterative
// 4-way merge rounds across cores (VMS), then a final single-core merge that
// writes the sorted outputs (SortOut). Workspace ping-pongs between two GMs.
class MoeSortMultiCore : public MoeSortBase {
public:
    __aicore__ inline MoeSortMultiCore(){};
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Stage drivers.
    __aicore__ inline void VBSProcess();
    __aicore__ inline void UBSortProcess(int64_t progress, int64_t size, int64_t sortNum);
    __aicore__ inline void OneCoreVMSProcess(int64_t listNum, int64_t perListElements, int64_t lastListElements);
    __aicore__ inline void VMSProcess();
    __aicore__ inline void SortOutProcess();
    // Per-tile sort pipeline steps.
    __aicore__ inline void VBSCopyIn(int64_t progress, int64_t size, int64_t sortNum);
    __aicore__ inline void UBSortCompute(int64_t progress, int64_t size, int64_t sortNum);
    __aicore__ inline void VBSCopyOut(int64_t progress, int64_t size, int64_t sortNum);
    // Merge-sorter wiring helpers.
    __aicore__ inline void InitMoeMrgSort(MoeMrgsort *sorter, int64_t listNum, int64_t coreOffset, int64_t loopOffset);
    __aicore__ inline void InitMoeMrgSortOut(MoeMrgsortOut *sorter, int64_t listNum, int64_t coreOffset);

private:
    // Ping-pong workspaces; srcWsIndex selects the current source side.
    GlobalTensor<float> workspaceGms[2];

    const MoeCustomVBSComputeTilingData *vbsTilingData;
    const MoeCustomVMSMiddleComputeTilingData *vmsTilingData;
    const MoeCustomSortOutComputeTilingData *sortOutTilingData;

    // for MoeMrgsort
    MoeMrgsort mrgsorter;
    MoeMrgsortParam mrgsortParam;

    int64_t coreNum;
    int64_t blockIdx;
    int64_t srcWsIndex = 0;

    // Current merge-round list geometry.
    int64_t listNum;
    int64_t perListElements;
    int64_t lastListElements;

    // Per-core VBS tiling.
    int64_t sortTotalLength;
    int64_t sortCoreLoops;
    int64_t sortCoreLoopElements;
    int64_t sortCoreLastLoopElements;

    int64_t perCoreExpert;
    int64_t needInitExpertCore;
    int64_t currentCoreExpert;

    // Hardware merge-sort fan-in.
    static constexpr int64_t MAX_MRGSORT_LIST = 4;
};
// Loads one UB tile of expert ids for this core and generates the matching
// global row indices (blockIdx * perCoreElements + tile offset) in the second
// half of the tile buffer.
__aicore__ inline void MoeSortMultiCore::VBSCopyIn(int64_t progress, int64_t size, int64_t sortNum)
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.AllocTensor<int32_t>();
    int64_t inOffset = progress * sortCoreLoopElements;
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(size * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams<int32_t> dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm[inOffset], dataCopyParams, dataCopyPadParams);

    // Row indices occupy inLocal[sortNum ..]; values are absolute row numbers.
    LocalTensor<int32_t> rowIdxLocal = inLocal[sortNum];
    int64_t startValue = this->blockIdx * this->vbsTilingData->perCoreElements + inOffset;
    // Wait for any prior MTE3 use of this buffer before scalar/vector writes.
    SetWaitFlag<HardEvent::MTE3_S>(HardEvent::MTE3_S);
    ArithProgression<int32_t>(rowIdxLocal, startValue, 1, size);
    sortDataCopyInQueue.EnQue(inLocal);
}
// Sorts one UB tile: casts ids to fp32 in place and negates them so the
// hardware Sort's ordering yields ascending expert ids (assumes Sort is
// descending — confirm with AscendC docs). Under expert parallelism (ep_),
// ids below expertStart_ are replaced by MIN_FP32 so they sink to the end
// (NOTE(review): only the lower bound is masked here — verify the upper bound
// is handled elsewhere, and check the Select operand order). The repeat tail
// is padded with MIN_FP32 before the paired key/value Sort.
__aicore__ inline void MoeSortMultiCore::UBSortCompute(int64_t progress, int64_t size, int64_t sortNum)
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> expertForSourceRowLocal = inLocal[0];
    LocalTensor<float> expertForSourceRowLocalFp32;

    // In-place int32 -> fp32 conversion of the sort keys.
    expertForSourceRowLocalFp32 = expertForSourceRowLocal.ReinterpretCast<float>();
    Cast(expertForSourceRowLocalFp32, expertForSourceRowLocal, RoundMode::CAST_ROUND, sortNum);

    Muls(expertForSourceRowLocalFp32, expertForSourceRowLocalFp32, (float)-1, sortNum);

    if (ep_) {
        LocalTensor<uint8_t> maskLocalTensor = sortedBuffer.Get<uint8_t>();
        // mask: -id > -expertStart_  (i.e. id < expertStart_).
        AscendC::CompareScalar(
            maskLocalTensor, expertForSourceRowLocalFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::GT,
            (sortNum + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM * ONE_REPEAT_COMPARE_NUM);
        LocalTensor<float> floatMinLocalTensor = tempBuffer.Get<float>();
        Duplicate(floatMinLocalTensor, MIN_FP32, sortNum);
        Select(expertForSourceRowLocalFp32, maskLocalTensor, floatMinLocalTensor, expertForSourceRowLocalFp32,
               SELMODE::VSEL_TENSOR_TENSOR_MODE, sortNum);
    }

    // Pad the tail of the last sort repeat with MIN_FP32.
    int64_t duplicateNum = size % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = size - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM);
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertForSourceRowLocalFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    // Paired sort: keys are the negated ids, values are the row indices.
    LocalTensor<float> concatLocal = expertForSourceRowLocalFp32;
    LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(sortNum));
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    LocalTensor<uint32_t> sourceRowLocal;
    sourceRowLocal = inLocal[sortNum].ReinterpretCast<uint32_t>();
    Sort<float, true>(outLocal, concatLocal, sourceRowLocal, sortedLocal, sortNum / ONE_REPEAT_SORT_NUM);

    sortDataCopyOutQueue.EnQue<float>(outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
}
// Writes one sorted tile's interleaved (key, value) stream into workspace 0 at
// this core's region, offset by the tile's position within the core.
__aicore__ inline void MoeSortMultiCore::VBSCopyOut(int64_t progress, int64_t size, int64_t sortNum)
{
    LocalTensor<float> outLocal = sortDataCopyOutQueue.DeQue<float>();
    DataCopy(workspaceGms[0][this->blockIdx * GetSortLen<float>(this->vbsTilingData->perCoreElements) +
                             GetSortLen<float>(progress * sortCoreLoopElements)],
             outLocal, Align(GetSortLen<float>(size), sizeof(float)));
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
// Wires the merge sorter's inputs/outputs for one merge pass: source lists are
// read from the current ping-pong workspace, merged output goes to the other.
// NOTE(review): the same srcWsGm handle is registered for every input list —
// this assumes MoeMrgsort::SetInput offsets per registered list internally;
// verify against the MoeMrgsort implementation.
__aicore__ inline void MoeSortMultiCore::InitMoeMrgSort(MoeMrgsort *sorter, int64_t listNum, int64_t coreOffset,
                                                        int64_t loopOffset)
{
    GlobalTensor<float> srcWsGm = workspaceGms[srcWsIndex][blockIdx * coreOffset + loopOffset];
    // Tensors are only used to describe UB regions for the sorter; they are
    // freed immediately after registration.
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    for (int64_t i = 0; i < listNum; i++) {
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(oneLoopMaxElements_) * i];
        sorter->SetInput(srcWsGm, inLocalT);
    }
    GlobalTensor<float> dstWsGm = workspaceGms[1 - srcWsIndex][blockIdx * coreOffset + loopOffset];
    sorter->SetOutput(dstWsGm, outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
// Wires the final-stage merge sorter: inputs come from the current ping-pong
// workspace; outputs go directly to the sorted expert-id and row-index GMs.
// NOTE(review): as in InitMoeMrgSort, the same srcWsGm handle is registered
// for every list — assumes per-list offsetting inside MoeMrgsortOut; verify.
__aicore__ inline void MoeSortMultiCore::InitMoeMrgSortOut(MoeMrgsortOut *sorter, int64_t listNum, int64_t coreOffset)
{
    GlobalTensor<float> srcWsGm = workspaceGms[srcWsIndex];
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();

    for (int64_t i = 0; i < listNum; i++) {
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(oneLoopMaxElements_) * i];
        sorter->SetInput(srcWsGm, inLocalT);
    }

    // Keys go to outLocal, values (row indices) to outLocalV.
    LocalTensor<float> outLocalV = outLocal[oneLoopMaxElements_ * MAX_MRGSORT_LIST];
    sorter->SetOutput(this->sortedexpertIdxGm, this->expendedRowIdxGm, outLocal, outLocalV);

    LocalTensor<float> tempBuffer = sortedBuffer.Get<float>(GetSortLen<float>(oneLoopMaxElements_) * MAX_MRGSORT_LIST);
    sorter->SetBuffer(tempBuffer);
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
// Single-core iterative merge: repeatedly 4-way-merges this core's sorted
// lists (swapping ping-pong workspaces each round) until one list remains.
// The last list of each round may be shorter (lastListElements).
__aicore__ inline void MoeSortMultiCore::OneCoreVMSProcess(int64_t listNum, int64_t perListElements,
                                                           int64_t lastListElements)
{
    int64_t coreOffset = GetSortLen<float>(this->vbsTilingData->perCoreElements);
    mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;

    // Loop condition is effectively "until one list remains" — the break below
    // fires once a round collapses everything into a single list.
    for (int64_t i = 0; listNum >= 1; i++) {
        int64_t loops = (listNum + MAX_MRGSORT_LIST - 1) / MAX_MRGSORT_LIST;
        int64_t remainListNum = listNum - (loops - 1) * MAX_MRGSORT_LIST;

        // Full 4-list merges: all lists are the regular size.
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.lastListElements = perListElements;

        int64_t loopOffset = GetSortLen<float>(mrgsortParam.perListElements * MAX_MRGSORT_LIST);
        for (int64_t loop = 0; loop < loops - 1; loop++) {
            InitMoeMrgSort(&mrgsorter, MAX_MRGSORT_LIST, coreOffset, loop * loopOffset);
            mrgsorter.Init(&mrgsortParam);
            mrgsorter.Process();
        }

        // Final (possibly partial) merge of this round carries the short tail list.
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.lastListElements = lastListElements;
        InitMoeMrgSort(&mrgsorter, remainListNum, coreOffset, (loops - 1) * loopOffset);
        mrgsorter.Init(&mrgsortParam);
        mrgsorter.Process();

        // Next round: merged lists are MAX_MRGSORT_LIST times larger.
        listNum = loops;
        lastListElements = perListElements * (remainListNum - 1) + lastListElements;
        perListElements = perListElements * MAX_MRGSORT_LIST;
        srcWsIndex = (srcWsIndex + 1) % WORK_GM_NUM;
        if (loops == 1) {
            break;
        }
    }
}
// One UB tile through the three-step sort pipeline: copy in, sort, copy out.
__aicore__ inline void MoeSortMultiCore::UBSortProcess(int64_t progress, int64_t size, int64_t sortNum)
{
    VBSCopyIn(progress, size, sortNum);
    UBSortCompute(progress, size, sortNum);
    VBSCopyOut(progress, size, sortNum);
}
// VBS stage: each active core sorts its tiles one UB loop at a time (the last
// loop may be smaller), then merges its own tiles locally when there was more
// than one. Ends with a device-wide barrier so VMS sees all per-core results.
__aicore__ inline void MoeSortMultiCore::VBSProcess()
{
    if (this->blockIdx < this->vbsTilingData->needCoreNum) {
        int64_t sortNum = Ceil(sortCoreLoopElements, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
        for (int64_t loop = 0; loop < sortCoreLoops - 1; loop++) {
            UBSortProcess(loop, sortCoreLoopElements, sortNum);
        }

        // Last loop uses its own (possibly smaller) element count.
        sortNum = Ceil(sortCoreLastLoopElements, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
        UBSortProcess(sortCoreLoops - 1, sortCoreLastLoopElements, sortNum);

        if (sortCoreLoops > 1) {
            OneCoreVMSProcess(sortCoreLoops, sortCoreLoopElements, sortCoreLastLoopElements);
        }
    }
    SyncAll();
}
// VMS stage: cross-core merge rounds. While more than MAX_MRGSORT_LIST sorted
// per-core lists remain, each active core 4-way-merges a group of lists (the
// last core takes the shorter tail group), the ping-pong workspaces swap, and
// all cores barrier before the next round. Exits with at most
// MAX_MRGSORT_LIST lists for SortOutProcess().
__aicore__ inline void MoeSortMultiCore::VMSProcess()
{
    int64_t currentStageNeedCoreNum = this->vmsTilingData->needCoreNum;
    perListElements = this->vbsTilingData->perCoreElements;
    lastListElements = this->vbsTilingData->lastCoreElements;
    listNum = this->vbsTilingData->needCoreNum;

    for (; listNum > MAX_MRGSORT_LIST;) {
        currentStageNeedCoreNum = Ceil(listNum, MAX_MRGSORT_LIST);
        int64_t coreOffset = GetSortLen<float>(perListElements * MAX_MRGSORT_LIST);
        int64_t remainListNum = listNum - (currentStageNeedCoreNum - 1) * MAX_MRGSORT_LIST;

        if (this->blockIdx < currentStageNeedCoreNum - 1) {
            // Regular cores merge MAX_MRGSORT_LIST equal-sized lists.
            mrgsortParam.perListElements = perListElements;
            mrgsortParam.lastListElements = perListElements;
            mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;
            InitMoeMrgSort(&mrgsorter, MAX_MRGSORT_LIST, coreOffset, 0);
            mrgsorter.Init(&mrgsortParam);
            mrgsorter.Process();
        } else if (this->blockIdx == currentStageNeedCoreNum - 1) {
            // The last active core merges the remaining (shorter) group.
            mrgsortParam.perListElements = perListElements;
            mrgsortParam.lastListElements = lastListElements;
            mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;
            InitMoeMrgSort(&mrgsorter, remainListNum, coreOffset, 0);
            mrgsorter.Init(&mrgsortParam);
            mrgsorter.Process();
        }
        listNum = currentStageNeedCoreNum;
        currentStageNeedCoreNum = Ceil(listNum, MAX_MRGSORT_LIST);
        srcWsIndex = (srcWsIndex + 1) % WORK_GM_NUM;

        // Geometry for the next round: merged lists are 4x larger.
        lastListElements = perListElements * (remainListNum - 1) + lastListElements;
        perListElements = perListElements * MAX_MRGSORT_LIST;

        SyncAll();
    }
}
// Final stage: core 0 merges the remaining (<= MAX_MRGSORT_LIST) sorted lists
// and writes the results straight to the sorted-expert-id and row-index
// outputs; all cores barrier afterward.
__aicore__ inline void MoeSortMultiCore::SortOutProcess()
{
    if (this->blockIdx < 1) {
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.lastListElements = lastListElements;
        mrgsortParam.oneLoopMaxElements = oneLoopMaxElements_;

        MoeMrgsortOut sorter;
        InitMoeMrgSortOut(&sorter, listNum, GetSortLen<float>(perListElements));
        sorter.Init(&mrgsortParam, pipe);
        sorter.Process();
    }
    SyncAll();
}
// One-time setup for the multi-core sort pipeline.
//   expertIdx      : [n*k] int32 expert id per (token, top-k) pair (input).
//   expendedRowIdx : output row-index buffer, used directly in SCATTER mode.
//   workspace      : user GM scratch; layout established below.
//   tilingData     : host-computed split parameters; tPipe: UB buffer manager.
//
// Workspace layout (int32 units, T = Align(n*k, sizeof(int32_t))):
//   [0,   T)                    sorted expert ids
//   [T,   2T)                   expanded row idx (GATHER mode staging)
//   [2T,  2T+actualExpertNum)   per-expert counters (zeroed by core 0)
//   then two ping-pong (key,value) float regions of 2T each for merge rounds.
__aicore__ inline void MoeSortMultiCore::Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                              const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->totalLength = tilingData->n * tilingData->k;
    this->coreNum = tilingData->coreNum;
    this->vbsTilingData = &(tilingData->vbsComputeParamsOp);
    this->vmsTilingData = &(tilingData->vmsMiddleComputeParamsOp);
    this->sortOutTilingData = &(tilingData->sortOutComputeParamsOp);

    this->blockIdx = GetBlockIdx();
    this->tileLength = this->vbsTilingData->perCorePerLoopElements;
    this->sortTotalLength = this->vbsTilingData->perCoreElements;
    // The last active core gets the (possibly smaller) remainder slice.
    if (this->blockIdx == tilingData->vbsComputeParamsOp.needCoreNum - 1) {
        this->tileLength = this->vbsTilingData->lastCorePerLoopElements;
        this->sortTotalLength = this->vbsTilingData->lastCoreElements;
    }
    this->n = tilingData->n;
    this->k = tilingData->k;
    this->ep_ = tilingData->ep;
    // With expert-parallel (ep) routing the merge-loop element cap comes from
    // tiling; otherwise the compile-time maximum applies.
    this->oneLoopMaxElements_ = ep_ ? this->sortOutTilingData->oneLoopMaxElements : MRGSORT_LIST_MAX_ELEMENT;

    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;

    // VBS param init
    if (this->blockIdx == this->vbsTilingData->needCoreNum - 1) {
        sortCoreLoops = this->vbsTilingData->lastCoreLoops;
        sortCoreLoopElements = this->vbsTilingData->lastCorePerLoopElements;
        sortCoreLastLoopElements = this->vbsTilingData->lastCoreLastLoopElements;
    } else {
        sortCoreLoops = this->vbsTilingData->perCoreLoops;
        sortCoreLoopElements = this->vbsTilingData->perCorePerLoopElements;
        sortCoreLastLoopElements = this->vbsTilingData->perCoreLastLoopElements;
    }

    this->pipe = tPipe;
    // Each core views only its own slice of the expert-id input.
    expertIdxGm.SetGlobalBuffer((__gm__ int32_t *)expertIdx +
                                    this->blockIdx * tilingData->vbsComputeParamsOp.perCoreElements,
                                this->sortTotalLength);
    sortedexpertIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(workspace),
                                      Align(this->totalLength, sizeof(int32_t)));
    if (rowIdxType_ == SCATTER) {
        // SCATTER: row indices are written straight to the caller's output.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx, Align(this->totalLength, sizeof(int32_t)));
    } else {
        // GATHER: stage row indices in workspace for a later gather pass.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->totalLength, sizeof(int32_t)),
                                         Align(this->totalLength, sizeof(int32_t)));
    }

    if (GetBlockIdx() == 0) {
        // Core 0 zeroes the per-expert token counters once, then raises an
        // MTE3->MTE2 flag so its own later reads see the cleared memory.
        expertCountTempGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                              Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2,
                                          tilingData->actualExpertNum);
        InitGlobalMemory(expertCountTempGm, tilingData->actualExpertNum, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }

    // key and value
    int64_t kvFactor = 2;
    // Two ping-pong merge regions, each sized for (key,value) pairs.
    workspaceGms[0].SetGlobalBuffer((__gm__ float *)workspace + Align(this->totalLength, sizeof(int32_t)) * 2 +
                                        tilingData->actualExpertNum,
                                    Align(this->totalLength, sizeof(int32_t)) * kvFactor);
    workspaceGms[1].SetGlobalBuffer((__gm__ float *)workspace +
                                        Align(this->totalLength, sizeof(int32_t)) * (kvFactor + 2) +
                                        tilingData->actualExpertNum,
                                    Align(this->totalLength, sizeof(int32_t)) * kvFactor);

    // UB buffers must hold the larger of a merge-round tile and a VBS tile,
    // rounded up to a whole Sort32 repeat, in (key,value) pairs.
    int64_t bufferSize = Ceil(Max(oneLoopMaxElements_ * MAX_MRGSORT_LIST, sortCoreLoopElements), ONE_REPEAT_SORT_NUM) *
                         ONE_REPEAT_SORT_NUM * sizeof(int32_t) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortedBuffer, bufferSize);
    if (ep_) {
        // Extra scratch is only needed for the ep masking path.
        pipe->InitBuffer(tempBuffer, bufferSize);
    }
}
|
||||
|
||||
// Runs the three-stage multi-core sort pipeline in order:
// per-core block sort (VBS), multi-round list merge (VMS),
// then the final single-core merge-out. Each stage synchronizes
// internally, so the ordering here is the full contract.
__aicore__ inline void MoeSortMultiCore::Process()
{
    VBSProcess();
    VMSProcess();
    SortOutProcess();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_VBS_ONE_CORE_H
|
||||
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_multi_core_performance.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_VBS_ONE_CORE_PERFORMANCE_H
|
||||
#define MOE_CUSTOM_VBS_ONE_CORE_PERFORMANCE_H
|
||||
|
||||
#include "moe_custom_sort_base.h"
|
||||
#include "moe_custom_mrgsort_performance.h"
|
||||
#include "moe_custom_mrgsort_out_performance.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Performance variant of the multi-core sorter: assumes per-core sorted runs
// already exist in workspace (produced by an earlier gather-sort stage) and
// performs one fixed VMS merge round followed by a single-core merge-out.
class MoeSortMultiCorePerformance : public MoeSortBase {
public:
    __aicore__ inline MoeSortMultiCorePerformance(){};
    // Binds GM buffers / tiling and allocates UB queues; must be called
    // before Process(). No expertIdx input: keys are read from workspace.
    __aicore__ inline void Init(GM_ADDR expendedRowIdx, GM_ADDR workspace, const MoeInitRoutingCustomTilingData *tilingData,
                                TPipe *tPipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void VMSProcess();
    __aicore__ inline void SortOutProcess();
    __aicore__ inline void InitMoeMrgSort(MoeMrgsortPerformance *sorter, int64_t coreOffset);
    __aicore__ inline void InitMoeMrgSortOut(MoeMrgsortOutPerformance *sorter);

private:
    // Ping-pong (key,value) merge regions in GM: [0] source, [1] destination.
    GlobalTensor<float> workspaceGms[2];
    // Per-list element counts gathered by the previous stage.
    GlobalTensor<int32_t> workspaceGatheredSortNumGm_;

    const MoeCustomSortOutComputeTilingData *sortOutTilingData;
    const MoeCustomVBSComputeTilingData *vbsTilingData;

    // for MoeMrgsortPerformance
    MoeMrgsortPerformance mrgsorter;
    MoeMrgsortPerformanceParam mrgsortParam;

    int64_t blockIdx;

    // Nominal elements per merge list, and the per-loop UB element cap.
    int64_t perListElements;
    int64_t maxPerListElements;
};
|
||||
|
||||
// Wires one merge-round sorter: this core merges MAX_MRGSORT_LIST adjacent
// lists from workspaceGms[0] into workspaceGms[1], with per-list lengths
// taken from the gathered-sort-num table.
// coreOffset is this core's byte offset into the ping-pong regions.
__aicore__ inline void MoeSortMultiCorePerformance::InitMoeMrgSort(MoeMrgsortPerformance *sorter, int64_t coreOffset)
{
    GlobalTensor<float> srcWsGm = workspaceGms[0][this->blockIdx * coreOffset]; // 0-3
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    GlobalTensor<int32_t> sortNumGm = workspaceGatheredSortNumGm_[this->blockIdx * MAX_MRGSORT_LIST];
    for (int64_t i = 0; i < MAX_MRGSORT_LIST; i++) {
        // Each call registers one input list slot, with its own UB staging
        // slice; srcWsGm/sortNumGm are passed whole each time — SetInput
        // presumably advances per call (NOTE(review): confirm in
        // MoeMrgsortPerformance::SetInput).
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(maxPerListElements) * i];
        sorter->SetInput(srcWsGm, inLocalT, sortNumGm);
    }
    GlobalTensor<float> dstWsGm = workspaceGms[1][this->blockIdx * coreOffset];
    sorter->SetOutput(dstWsGm, outLocal);
    // Tensors are returned to their queues immediately; the sorter keeps the
    // UB addresses and manages reuse itself.
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// Wires the final merge-out sorter: merges the MAX_MRGSORT_LIST lists left in
// workspaceGms[1] and writes expert ids to sortedexpertIdxGm and row indices
// to expendedRowIdxGm.
__aicore__ inline void MoeSortMultiCorePerformance::InitMoeMrgSortOut(MoeMrgsortOutPerformance *sorter)
{
    GlobalTensor<float> srcWsGm = workspaceGms[1];
    LocalTensor<float> inLocal = sortDataCopyInQueue.AllocTensor<float>();
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    GlobalTensor<int32_t> sortNumGm = workspaceGatheredSortNumGm_;
    for (int64_t i = 0; i < MAX_MRGSORT_LIST; i++) {
        // One UB staging slice per input list (see InitMoeMrgSort for the
        // repeated-SetInput convention).
        LocalTensor<float> inLocalT = inLocal[GetSortLen<float>(maxPerListElements) * i];
        sorter->SetInput(srcWsGm, inLocalT, sortNumGm);
    }

    // outLocal is split: first half for keys, second half for values.
    LocalTensor<float> outLocalV = outLocal[maxPerListElements * MAX_MRGSORT_LIST];
    sorter->SetOutput(this->sortedexpertIdxGm, this->expendedRowIdxGm, outLocal, outLocalV);

    // Local name intentionally shadows the member tempBuffer: this is a UB
    // view carved from sortedBuffer, handed to the sorter as scratch.
    LocalTensor<float> tempBuffer = sortedBuffer.Get<float>(GetSortLen<float>(maxPerListElements) * MAX_MRGSORT_LIST);
    sorter->SetBuffer(tempBuffer);
    sortDataCopyInQueue.FreeTensor(inLocal);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// Single fixed merge round: the first MAX_MRGSORT_LIST cores each merge
// MAX_MRGSORT_LIST lists from workspaceGms[0] into workspaceGms[1];
// all cores then synchronize.
__aicore__ inline void MoeSortMultiCorePerformance::VMSProcess()
{
    int64_t currentStageNeedCoreNum = MAX_MRGSORT_LIST;
    // Byte span of one core's group of input lists in the ping-pong region.
    int64_t coreOffset = GetSortLen<float>(perListElements * MAX_MRGSORT_LIST);
    if (this->blockIdx <= currentStageNeedCoreNum - 1) {
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.oneLoopMaxElements = maxPerListElements;
        InitMoeMrgSort(&mrgsorter, coreOffset);
        mrgsorter.Init(&mrgsortParam);
        mrgsorter.Process();
    }
    SyncAll();
}
|
||||
|
||||
// Final stage: core 0 merges the lists produced by VMSProcess into the
// output buffers, then zeroes the per-expert counters (the counter region is
// only safe to clear after the merge has consumed the workspace).
__aicore__ inline void MoeSortMultiCorePerformance::SortOutProcess()
{
    if (this->blockIdx < 1) {
        mrgsortParam.perListElements = perListElements;
        mrgsortParam.oneLoopMaxElements = maxPerListElements;
        MoeMrgsortOutPerformance sorter;
        InitMoeMrgSortOut(&sorter);
        sorter.Init(&mrgsortParam, pipe);
        sorter.Process();
        // Clear one counter slot per local expert (expertStart_..expertEnd_).
        InitGlobalMemory(expertCountTempGm, expertEnd_ - expertStart_, 0);
        // Ensure the GM clear (MTE3) is visible before subsequent reads (MTE2).
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }
    SyncAll();
}
|
||||
|
||||
// One-time setup for the performance sorter. Keys are expected to already be
// staged in workspace by the preceding gather-sort stage, so no expertIdx
// input is taken.
//
// Workspace layout (T = Align(n*k, sizeof(int32_t)) int32 elements):
//   [0,  2T)   workspaceGms[0] — (key,value) source lists
//   [2T, 4T)   workspaceGms[1] — (key,value) merge destination
//   [4T, ...)  gathered per-list sort counts (MAX_MRGSORT_LIST_TOTAL entries)
//   expertCountTempGm is placed at 2T, overlapping workspaceGms[1] —
//   NOTE(review): this looks like deliberate reuse, since the counters are
//   only cleared in SortOutProcess after the merge has consumed that region;
//   confirm against the consuming expert-count kernel.
__aicore__ inline void MoeSortMultiCorePerformance::Init(GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                                         const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->totalLength = tilingData->n * tilingData->k;
    this->blockIdx = GetBlockIdx();
    this->n = tilingData->n;
    this->k = tilingData->k;
    this->vbsTilingData = &(tilingData->vbsComputeParamsOp);
    this->sortOutTilingData = &(tilingData->sortOutComputeParamsOp);
    // Split all n*k elements evenly over the total number of merge lists.
    this->perListElements = Ceil(this->totalLength, MAX_MRGSORT_LIST_TOTAL);
    this->maxPerListElements = this->sortOutTilingData->oneLoopMaxElements;

    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;

    this->pipe = tPipe;
    sortedexpertIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(workspace),
                                      Align(this->totalLength, sizeof(int32_t)));
    if (rowIdxType_ == SCATTER) {
        // SCATTER: write row indices straight to the caller's output.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx, Align(this->totalLength, sizeof(int32_t)));
    } else {
        // GATHER: stage row indices in workspace for the later gather pass.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->totalLength, sizeof(int32_t)),
                                         Align(this->totalLength, sizeof(int32_t)));
    }

    // key and value
    int64_t kvFactor = 2;
    workspaceGms[0].SetGlobalBuffer((__gm__ float *)workspace, Align(this->totalLength, sizeof(float)) * kvFactor);
    workspaceGms[1].SetGlobalBuffer((__gm__ float *)workspace + Align(this->totalLength, sizeof(float)) * kvFactor,
                                    Align(this->totalLength, sizeof(float)) * kvFactor);
    workspaceGatheredSortNumGm_.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                                    Align(this->totalLength, sizeof(int32_t)) * kvFactor * kvFactor,
                                                MAX_MRGSORT_LIST_TOTAL);
    expertCountTempGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->totalLength, sizeof(int32_t)) * 2,
                                      expertEnd_ - expertStart_);

    // UB buffers sized for one full merge group of (key,value) pairs,
    // rounded up to whole Sort32 repeats.
    int64_t bufferSize = Ceil(maxPerListElements * MAX_MRGSORT_LIST, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM *
                         sizeof(float) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, bufferSize);
    pipe->InitBuffer(sortedBuffer, bufferSize);
    pipe->InitBuffer(tempBuffer, bufferSize);
}
|
||||
|
||||
// Runs the two-stage performance pipeline: one multi-core merge round, then
// the single-core merge-out. Each stage ends in SyncAll(), so the call order
// here is the full contract.
__aicore__ inline void MoeSortMultiCorePerformance::Process()
{
    VMSProcess();
    SortOutProcess();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_VBS_ONE_CORE_PERFORMANCE_H
|
||||
@@ -0,0 +1,167 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_custom_sort_one_core.h
|
||||
* \brief
|
||||
*/
|
||||
#ifndef MOE_CUSTOM_SORT_ONE_CORE_H
|
||||
#define MOE_CUSTOM_SORT_ONE_CORE_H
|
||||
|
||||
#include "moe_custom_sort_base.h"
|
||||
|
||||
namespace MoeInitRoutingCustom {
|
||||
using namespace AscendC;
|
||||
|
||||
// Single-core sorter for small n*k: loads all expert ids at once, sorts them
// (with their source-row indices) entirely in UB on core 0, and writes the
// sorted ids plus row indices back to GM. Other cores only hit the barrier.
class MoeSortOneCore : public MoeSortBase {
public:
    __aicore__ inline MoeSortOneCore(){};
    // Binds GM buffers / tiling and allocates UB; call before Process().
    __aicore__ inline void Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe);
    __aicore__ inline void Process();

private:
    // Pipeline stages run on core 0 only, in this order.
    __aicore__ inline void CopyIn();
    __aicore__ inline void SortCompute();
    __aicore__ inline void ExpertCountCompute();
    __aicore__ inline void CopyOut();

private:
    // tileLength rounded up to a whole Sort32 repeat (32 elements).
    int64_t sortNum;
};
|
||||
|
||||
// Loads all expert ids into the first half of a UB tensor and fills the
// second half with the identity row indices 0..sortNum-1, which travel with
// the keys through the sort.
__aicore__ inline void MoeSortOneCore::CopyIn()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.AllocTensor<int32_t>();
    // One burst of totalLength int32 elements, no padding inserted.
    DataCopyExtParams dataCopyParams{static_cast<uint16_t>(1),
                                     static_cast<uint32_t>(this->totalLength * sizeof(int32_t)), 0, 0, 0};
    DataCopyPadExtParams dataCopyPadParams{false, 0, 0, 0};
    DataCopyPad(inLocal[0], expertIdxGm, dataCopyParams, dataCopyPadParams);
    // Second half of inLocal: arithmetic progression 0,1,2,... = source rows.
    LocalTensor<int32_t> rowIdxLocal = inLocal[this->sortNum];
    ArithProgression<int32_t>(rowIdxLocal, 0, 1, this->sortNum);
    sortDataCopyInQueue.EnQue(inLocal);
}
|
||||
|
||||
// Sorts the loaded expert ids ascending while carrying their source-row
// indices. The hardware Sort32 sorts descending, so ids are negated first,
// sorted, then negated back. Tail slots and (in ep mode) out-of-range ids are
// set to MIN_FP32 so they land at the end of the sorted order.
__aicore__ inline void MoeSortOneCore::SortCompute()
{
    LocalTensor<int32_t> inLocal = sortDataCopyInQueue.DeQue<int32_t>();
    LocalTensor<int32_t> expertIdx = inLocal[0];
    // Reinterpret in place: ids become floats so the float sorter can run.
    LocalTensor<float> expertIdxFp32 = expertIdx.ReinterpretCast<float>();
    Cast(expertIdxFp32, expertIdx, RoundMode::CAST_ROUND, this->tileLength);
    // Negate: descending sort of -id == ascending sort of id.
    Muls(expertIdxFp32, expertIdxFp32, (float)-1, this->tileLength);

    if (ep_) {
        // Expert-parallel mode: ids below expertStart_ (i.e. -id > -start)
        // are masked to MIN_FP32 so they sort to the very end.
        LocalTensor<uint8_t> maskLocalTensor = sortedBuffer.Get<uint8_t>();
        AscendC::CompareScalar(maskLocalTensor, expertIdxFp32, static_cast<float>(-expertStart_), AscendC::CMPMODE::GT,
                               (this->totalLength + ONE_REPEAT_COMPARE_NUM - 1) / ONE_REPEAT_COMPARE_NUM *
                                   ONE_REPEAT_COMPARE_NUM);
        LocalTensor<float> floatMinLocalTensor = tempBuffer.Get<float>();
        Duplicate(floatMinLocalTensor, MIN_FP32, this->tileLength);
        Select(expertIdxFp32, maskLocalTensor, floatMinLocalTensor, expertIdxFp32, SELMODE::VSEL_TENSOR_TENSOR_MODE,
               this->totalLength);
    }

    // Pad the last partial Sort32 repeat with MIN_FP32 via a bit mask so the
    // padding cannot displace real elements.
    int64_t duplicateNum = this->totalLength % ONE_REPEAT_SORT_NUM;
    if (duplicateNum > 0) {
        int duplicateIndex = this->totalLength - duplicateNum;
        uint64_t mask0 = UINT64_MAX;
        mask0 = mask0 << duplicateNum;                   // keep only tail lanes
        mask0 = mask0 & (UINT64_MAX >> ONE_REPEAT_SORT_NUM); // limit to one repeat
        uint64_t mask[2] = {mask0, 0};
        Duplicate(expertIdxFp32[duplicateIndex], MIN_FP32, mask, 1, DST_BLK_STRIDE, DST_REP_STRIDE);
    }

    // Pack keys into the interleaved (score,index) layout Sort expects.
    LocalTensor<float> concatLocal;
    LocalTensor<float> tempTensor = tempBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    Concat(concatLocal, expertIdxFp32, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    LocalTensor<float> sortedLocal = sortedBuffer.Get<float>(GetSortLen<float>(this->sortNum));
    LocalTensor<uint32_t> sourceRowLocal;
    sourceRowLocal = inLocal[this->sortNum].ReinterpretCast<uint32_t>();
    // Full sort carrying sourceRowLocal as the payload index.
    Sort<float, true>(sortedLocal, concatLocal, sourceRowLocal, tempTensor, this->sortNum / ONE_REPEAT_SORT_NUM);

    // Unpack into: first half = sorted (negated) ids, second half = row idx.
    LocalTensor<float> outLocal = sortDataCopyOutQueue.AllocTensor<float>();
    LocalTensor<float> sortedExpertForSourceRowLocal = outLocal[0];
    LocalTensor<uint32_t> expandDstToSrcRowLocal;
    expandDstToSrcRowLocal = outLocal[this->sortNum].ReinterpretCast<uint32_t>();
    Extract(sortedExpertForSourceRowLocal, expandDstToSrcRowLocal, sortedLocal, this->sortNum / ONE_REPEAT_SORT_NUM);
    // Undo the earlier negation to restore real expert ids.
    Muls(sortedExpertForSourceRowLocal, sortedExpertForSourceRowLocal, (float)-1, this->tileLength);

    // Reinterpret back to int32 in place for the GM copy-out.
    LocalTensor<int32_t> expertForSourceRowLocalInt32;
    expertForSourceRowLocalInt32 = sortedExpertForSourceRowLocal.ReinterpretCast<int32_t>();
    Cast(expertForSourceRowLocalInt32, sortedExpertForSourceRowLocal, RoundMode::CAST_ROUND, this->tileLength);
    sortDataCopyOutQueue.EnQue<float>(outLocal);
    sortDataCopyInQueue.FreeTensor(inLocal);
}
|
||||
|
||||
// Writes the two halves of the sorted result back to GM: sorted expert ids
// to the workspace region and the matching row indices to expendedRowIdxGm.
__aicore__ inline void MoeSortOneCore::CopyOut()
{
    LocalTensor<int32_t> outLocal = sortDataCopyOutQueue.DeQue<int32_t>();
    DataCopyParams intriParams;
    intriParams.blockCount = 1;
    // Only the real totalLength elements are copied; sortNum padding stays in UB.
    intriParams.blockLen = this->totalLength * sizeof(int32_t);
    DataCopyPad(sortedexpertIdxGm, outLocal[0], intriParams);
    DataCopyPad(expendedRowIdxGm, outLocal[this->sortNum], intriParams);
    sortDataCopyOutQueue.FreeTensor(outLocal);
}
|
||||
|
||||
// One-time setup for the single-core sorter.
//   expertIdx      : [n*k] int32 expert id per (token, top-k) pair (input).
//   expendedRowIdx : output row-index buffer, used directly in SCATTER mode.
//   workspace      : user GM scratch (sorted ids at offset 0, staged row
//                    indices after them, per-expert counters at 2*Align(n*k)).
//   tilingData     : host-computed parameters; tPipe: UB buffer manager.
// Fix vs. original: removed the dead local `int64_t coreNum = GetBlockNum();`
// — it was never read (this->coreNum is already set from tiling data).
__aicore__ inline void MoeSortOneCore::Init(GM_ADDR expertIdx, GM_ADDR expendedRowIdx, GM_ADDR workspace,
                                            const MoeInitRoutingCustomTilingData *tilingData, TPipe *tPipe)
{
    this->pipe = tPipe;
    // In the one-core path the "last core" slice covers the whole input.
    this->tileLength = Align(tilingData->vbsComputeParamsOp.lastCorePerLoopElements, sizeof(int32_t));
    // Round up to whole Sort32 repeats (32 elements each).
    this->sortNum = Ceil(this->tileLength, ONE_REPEAT_SORT_NUM) * ONE_REPEAT_SORT_NUM;
    this->totalLength = tilingData->n * tilingData->k;
    this->coreNum = tilingData->coreNum;
    this->ep_ = tilingData->ep;
    expertStart_ = tilingData->expertStart;
    expertEnd_ = tilingData->expertEnd;
    rowIdxType_ = tilingData->rowIdxType;

    expertIdxGm.SetGlobalBuffer((__gm__ int32_t *)expertIdx, this->tileLength);
    sortedexpertIdxGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(workspace),
                                      Align(this->totalLength, sizeof(int32_t)));
    if (rowIdxType_ == SCATTER) {
        // SCATTER: row indices go straight to the caller's output buffer.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)expendedRowIdx, this->tileLength);
    } else {
        // GATHER: stage row indices in workspace for the later gather pass.
        // NOTE(review): the offset uses tileLength where the multi-core class
        // uses totalLength — equivalent only if tileLength == Align(n*k);
        // confirm against the tiling that selects this kernel.
        expendedRowIdxGm.SetGlobalBuffer((__gm__ int32_t *)workspace + Align(this->tileLength, sizeof(int32_t)),
                                         Align(this->tileLength, sizeof(int32_t)));
    }

    if (GetBlockIdx() == 0) {
        // Core 0 zeroes the per-expert token counters once, then raises an
        // MTE3->MTE2 flag so its own later reads see the cleared memory.
        expertCountTempGm.SetGlobalBuffer((__gm__ int32_t *)workspace +
                                              Align(tilingData->n * tilingData->k, sizeof(int32_t)) * 2,
                                          tilingData->actualExpertNum);
        InitGlobalMemory(expertCountTempGm, tilingData->actualExpertNum, 0);
        SetWaitFlag<HardEvent::MTE3_MTE2>(HardEvent::MTE3_MTE2);
    }

    // key and value
    int64_t kvFactor = 2;
    // Each UB buffer holds sortNum (key,value) int32 pairs.
    int64_t buffSize = this->sortNum * sizeof(int32_t) * kvFactor;
    pipe->InitBuffer(sortDataCopyInQueue, bufferNum, buffSize);
    pipe->InitBuffer(sortDataCopyOutQueue, bufferNum, buffSize);
    pipe->InitBuffer(tempBuffer, buffSize);
    pipe->InitBuffer(sortedBuffer, buffSize);
}
|
||||
|
||||
// Entire sort runs on core 0: load, sort, store. All cores then meet at the
// barrier so later stages on any core see the sorted data in GM.
__aicore__ inline void MoeSortOneCore::Process()
{
    if (GetBlockIdx() < 1) {
        CopyIn();
        SortCompute();
        CopyOut();
    }
    this->SyncAll();
}
|
||||
} // namespace MoeInitRoutingCustom
|
||||
#endif // MOE_CUSTOM_SORT_ONE_CORE_H
|
||||
@@ -0,0 +1,412 @@
|
||||
/**
|
||||
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
|
||||
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
|
||||
* CANN Open Software License Agreement Version 2.0 (the "License").
|
||||
* Please refer to the License for details. You may not use this file except in compliance with the License.
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See LICENSE in the root of the software repository for the full text of the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \file moe_init_routing_custom.cpp
|
||||
* \brief
|
||||
*/
|
||||
#include "moe_custom_mrgsort_out.h"
|
||||
#include "moe_custom_mrgsort.h"
|
||||
#include "moe_custom_sort_one_core.h"
|
||||
#include "moe_custom_sort_multi_core.h"
|
||||
#include "moe_custom_gather_sort_multi_core.h"
|
||||
#include "moe_custom_expert_tokens_count.h"
|
||||
#include "moe_custom_row_idx_gather.h"
|
||||
#include "moe_custom_gather_out.h"
|
||||
#include "moe_custom_gather_dynamic_quant.h"
|
||||
#include "moe_custom_gather_static_quant.h"
|
||||
#include "moe_custom_full_load.h"
|
||||
#include "moe_custom_full_load_dynamic_quant.h"
|
||||
#include "moe_custom_full_load_static_quant.h"
|
||||
#include "moe_custom_full_load_unquantized.h"
|
||||
#include "moe_custom_sort_actual_expert.h"
|
||||
#include "moe_custom_sort_multi_core_performance.h"
|
||||
#include "moe_custom_row_idx_gather_droppad_dynamic.h"
|
||||
#include "moe_custom_row_idx_gather_droppad.h"
|
||||
#include "moe_custom_gather_out_droppad.h"
|
||||
#include "moe_custom_gather_droppad_static_quant.h"
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_PERFORMANCE 2000000
|
||||
#define UNQUANTIZED_FULLLOAD 2100000
|
||||
#define STATIC_QUANT_FULLLOAD 2200000
|
||||
#define DYNAMIC_QUANT_GATHER_NO_SCALE_FULLLOAD 2300000
|
||||
#define DYNAMIC_QUANT_GATHER_1H_DIM_SCALE_FULLLOAD 2301000
|
||||
#define DYNAMIC_QUANT_GATHER_EH_SCALE_FULLLOAD 2302000
|
||||
#define DYNAMIC_QUANT_SCATTER_NO_SCALE_FULLLOAD 2310000
|
||||
#define DYNAMIC_QUANT_SCATTER_1H_SCALE_FULLLOAD 2311000
|
||||
#define DYNAMIC_QUANT_SCATTER_EH_SCALE_FULLLOAD 2312000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_NODROP 1000000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_SCATTER_NODROP 1001000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_NODROP 1100000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_SCATTER_NODROP 1101000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_NODROP 1020000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_SCATTER_NODROP 1021000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_NODROP 1120000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_SCATTER_NODROP 1121000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_NODROP 1010000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_SCATTER_NODROP 1011000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_NODROP 1110000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_SCATTER_NODROP 1111000
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP 1000100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP 1100100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_DROP 1020100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_DROP 1120100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP 1010100
|
||||
#define MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP 1110100
|
||||
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER 1200000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER 1201000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER 1300000
|
||||
#define MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER 1301000
|
||||
|
||||
|
||||
using namespace AscendC;
|
||||
using namespace MoeInitRoutingCustom;
|
||||
extern "C" __global__ __aicore__ void moe_init_routing_custom(GM_ADDR x, GM_ADDR expertIdx, GM_ADDR scale, GM_ADDR offset,
|
||||
GM_ADDR expandedX, GM_ADDR expandedRowIdx,
|
||||
GM_ADDR expertTokensCountOrCumsum, GM_ADDR expandedScale,
|
||||
GM_ADDR workspace, GM_ADDR tiling)
|
||||
{
|
||||
KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0);
|
||||
if (g_coreType == AIC) {
|
||||
return;
|
||||
}
|
||||
|
||||
GET_TILING_DATA(tilingData, tiling);
|
||||
if (workspace == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
GM_ADDR userWS = GetUserWorkspace(workspace);
|
||||
if (userWS == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto t = &tilingData;
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_PERFORMANCE)) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoad op;
|
||||
op.Init(x, expertIdx, scale, offset, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_GATHER_NO_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, GATHER, NO_SCALE> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_GATHER_1H_DIM_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, GATHER, SCALE_1H> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_GATHER_EH_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, GATHER, SCALE_EH> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_SCATTER_NO_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, SCATTER, NO_SCALE> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_SCATTER_1H_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, SCATTER, SCALE_1H> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(DYNAMIC_QUANT_SCATTER_EH_SCALE_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadDynamicQuant<DTYPE_X, SCATTER, SCALE_EH> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(UNQUANTIZED_FULLLOAD)) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadUnquantized<DTYPE_X> op;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(STATIC_QUANT_FULLLOAD)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe fullLoadPipe;
|
||||
MoeCustomFullLoadStaticQuant<DTYPE_X> op;
|
||||
op.Init(x, expertIdx, scale, offset, expandedX, expandedRowIdx, expertTokensCountOrCumsum, userWS, t,
|
||||
&fullLoadPipe);
|
||||
op.Process();
|
||||
fullLoadPipe.Destroy();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER)) {
|
||||
TPipe sortActualExpertPipe;
|
||||
MoeSortActualExpert<DTYPE_X> op;
|
||||
bool isFinished = false;
|
||||
op.Init(x, expertIdx, scale, expandedX, expandedRowIdx, expertTokensCountOrCumsum, expandedScale, userWS, t,
|
||||
&sortActualExpertPipe);
|
||||
isFinished = op.Process();
|
||||
sortActualExpertPipe.Destroy();
|
||||
if (isFinished) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe gatherSortMultiCorePipe;
|
||||
MoeGatherSortMultiCore op;
|
||||
op.Init(expertIdx, expandedRowIdx, userWS, t, &gatherSortMultiCorePipe);
|
||||
op.Process();
|
||||
gatherSortMultiCorePipe.Destroy();
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe mergeSortMultiCorePipe;
|
||||
MoeSortMultiCorePerformance op;
|
||||
op.Init(expandedRowIdx, userWS, t, &mergeSortMultiCorePipe);
|
||||
op.Process();
|
||||
mergeSortMultiCorePipe.Destroy();
|
||||
}
|
||||
|
||||
TPipe sortPipe;
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP)) {
|
||||
MoeSortOneCore op;
|
||||
op.Init(expertIdx, expandedRowIdx, userWS, t, &sortPipe);
|
||||
op.Process();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP)) {
|
||||
MoeSortMultiCore op;
|
||||
op.Init(expertIdx, expandedRowIdx, userWS, t, &sortPipe);
|
||||
op.Process();
|
||||
}
|
||||
sortPipe.Destroy();
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_GATHER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe histogramPipe;
|
||||
if (t->expertTokensNumType == CUMSUM_MODE) {
|
||||
ExpertTokensCount<CUMSUM_MODE> countOp;
|
||||
countOp.Init<true>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else if (t->expertTokensNumType == COUNT_MODE) {
|
||||
ExpertTokensCount<COUNT_MODE> countOp;
|
||||
countOp.Init<true>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else {
|
||||
ExpertTokensCount<KEY_VALUE_MODE> countOp;
|
||||
countOp.Init<true>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
}
|
||||
|
||||
} else {
|
||||
if (t->dropPadMode == 1 || t->ep == 1 || t->expertTokensNumFlag != EXERPT_TOKENS_NONE) {
|
||||
TPipe histogramPipe;
|
||||
if (t->expertTokensNumType == CUMSUM_MODE) {
|
||||
ExpertTokensCount<CUMSUM_MODE> countOp;
|
||||
countOp.Init<false>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else if (t->expertTokensNumType == COUNT_MODE) {
|
||||
ExpertTokensCount<COUNT_MODE> countOp;
|
||||
countOp.Init<false>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
} else {
|
||||
ExpertTokensCount<KEY_VALUE_MODE> countOp;
|
||||
countOp.Init<false>(expandedRowIdx, expertTokensCountOrCumsum, userWS, t, &histogramPipe);
|
||||
countOp.Process();
|
||||
histogramPipe.Destroy();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP)) {
|
||||
TPipe rowIdxGatherDropPadPipe;
|
||||
MoeCustomSrcToDstWithCapacity<DTYPE_X, MoeInitRoutingCustomTilingData> rowIdxGatherDropPadOp;
|
||||
rowIdxGatherDropPadOp.Init(expandedRowIdx, expandedX, expandedScale, userWS, t, &rowIdxGatherDropPadPipe);
|
||||
rowIdxGatherDropPadOp.Process();
|
||||
rowIdxGatherDropPadPipe.Destroy();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP)) {
|
||||
TPipe rowIdxGatherDropPadPipe;
|
||||
MoeCustomSrcToDstWithCapacity<int8_t, MoeInitRoutingCustomTilingData> rowIdxGatherDropPadOp;
|
||||
rowIdxGatherDropPadOp.Init(expandedRowIdx, expandedX, expandedScale, userWS, t, &rowIdxGatherDropPadPipe);
|
||||
rowIdxGatherDropPadOp.Process();
|
||||
rowIdxGatherDropPadPipe.Destroy();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_DROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
MoeCustomSrcToDstAndGather<DTYPE_X, MoeInitRoutingCustomTilingData> gatherDroppadDynamicQuantOp;
|
||||
gatherDroppadDynamicQuantOp.Init(x, scale, expandedRowIdx, expandedX, expandedScale, userWS, t,
|
||||
&gatherPipe);
|
||||
gatherDroppadDynamicQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
} else {
|
||||
TPipe rowIdxPipe;
|
||||
RowIdxGather rowIdxGatherOp;
|
||||
rowIdxGatherOp.Init(expandedRowIdx, userWS, t, &rowIdxPipe);
|
||||
rowIdxGatherOp.Process();
|
||||
rowIdxPipe.Destroy();
|
||||
}
|
||||
|
||||
if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTONECORE_SCATTER) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_GATHER_SORTMULTICORE_SCATTER)) {
|
||||
TPipe gatherPipe;
|
||||
if (t->ep == 1) {
|
||||
MoeGatherOut<DTYPE_X, 1> gatherOp;
|
||||
gatherOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else {
|
||||
MoeGatherOut<DTYPE_X, 0> gatherOp;
|
||||
gatherOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_DYNAMICQUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_DYNAMICQUANT_GATHER_NODROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
if (t->ep == 0 and t->smoothType != SCALE_EH) {
|
||||
MoeGatherOutDynamicQuant<DTYPE_X, GATHER> gatherDynamicQuantOp;
|
||||
gatherDynamicQuantOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherDynamicQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else {
|
||||
MoeGatherOutDynamicQuant<DTYPE_X, SCATTER> gatherDynamicQuantOp;
|
||||
gatherDynamicQuantOp.Init(x, scale, userWS, expandedRowIdx, expandedX, expandedScale, t, &gatherPipe);
|
||||
gatherDynamicQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
}
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_SCATTER_NODROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_NODROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
if (t->ep == 1) {
|
||||
MoeGatherOutQuant<DTYPE_X, 1> gatherStaticQuantOp;
|
||||
gatherStaticQuantOp.Init(x, scale, offset, expandedRowIdx, expandedX, userWS, t, &gatherPipe);
|
||||
gatherStaticQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else {
|
||||
MoeGatherOutQuant<DTYPE_X, 0> gatherStaticQuantOp;
|
||||
gatherStaticQuantOp.Init(x, scale, offset, expandedRowIdx, expandedX, userWS, t, &gatherPipe);
|
||||
gatherStaticQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
}
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_GATHER_DROP)) {
|
||||
TPipe gatherPipe;
|
||||
MoeGatherOutDroppad<DTYPE_X> gatherDroppadOp;
|
||||
gatherDroppadOp.Init(x, scale, expandedRowIdx, expandedX, expandedScale, userWS, t, &gatherPipe);
|
||||
gatherDroppadOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
} else if (TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTONECORE_QUANT_GATHER_DROP) ||
|
||||
TILING_KEY_IS(MOE_INIT_ROUTING_CUSTOM_SORTMULTICORE_QUANT_GATHER_DROP)) {
|
||||
if constexpr (!IsSameType<DTYPE_X, int8_t>::value) {
|
||||
TPipe gatherPipe;
|
||||
MoeGatherDroppadQuant<DTYPE_X> gatherDroppadStaticQuantOp;
|
||||
gatherDroppadStaticQuantOp.Init(x, scale, offset, expandedRowIdx, expandedX, userWS, t, &gatherPipe);
|
||||
gatherDroppadStaticQuantOp.Process();
|
||||
gatherPipe.Destroy();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1118,6 +1118,106 @@ at::Tensor combine_prefill(const at::Tensor& x, const at::Tensor& topk_idx, cons
|
||||
return combined_x;
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_moe_init_routing_custom(
|
||||
const at::Tensor &x, const at::Tensor &expert_idx,
|
||||
const c10::optional<at::Tensor> &scale, const c10::optional<at::Tensor> &offset, int64_t active_num,
|
||||
int64_t expert_capacity, int64_t expert_num, int64_t drop_pad_mode, int64_t expert_tokens_num_type,
|
||||
bool expert_tokens_num_flag, int64_t quant_mode, at::IntArrayRef active_expert_range, int64_t row_idx_type)
|
||||
{
|
||||
constexpr int64_t DIM_X = 2;
|
||||
constexpr int64_t DIM_EXPERT_IDX = 2;
|
||||
constexpr int64_t LENGTH_ACTIVE_EXPERT_RANGE = 2;
|
||||
constexpr int64_t EXPERT_TOKENS_COUNT = 1;
|
||||
constexpr int64_t EXPERT_TOKENS_KEY_VALUE = 2;
|
||||
constexpr int64_t QUANT_MODE_UNQUANT = -1;
|
||||
constexpr int64_t QUANT_MODE_DYNAMIC_QUANT = 1;
|
||||
constexpr int64_t CUMSUM = 0;
|
||||
constexpr int64_t COUNT = 1;
|
||||
constexpr int64_t KEY_VALUE = 2;
|
||||
|
||||
if (active_expert_range.empty()) {
|
||||
active_expert_range = at::IntArrayRef({0, expert_num});
|
||||
}
|
||||
|
||||
int64_t x_dim = x.dim();
|
||||
TORCH_CHECK(x_dim == DIM_X, "The x should be ", DIM_X,
|
||||
"-Dimension, current is ", x_dim, "-Dimension.");
|
||||
|
||||
int64_t expert_idx_dim = expert_idx.dim();
|
||||
TORCH_CHECK(expert_idx_dim == DIM_EXPERT_IDX, "The expert_idx should be ", DIM_EXPERT_IDX,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int64_t active_expert_range_length = active_expert_range.size();
|
||||
TORCH_CHECK(active_expert_range_length == LENGTH_ACTIVE_EXPERT_RANGE, "The active_expert_range should be ", LENGTH_ACTIVE_EXPERT_RANGE,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int expert_length = active_expert_range[1] - active_expert_range[0];
|
||||
auto x_size = x.sizes();
|
||||
auto expert_idx_size = expert_idx.sizes();
|
||||
|
||||
int bs = x_size[0];
|
||||
int h = x_size[1];
|
||||
int k = expert_idx_size[1];
|
||||
int64_t expanded_scale_len = 0;
|
||||
at::Tensor expanded_x;
|
||||
|
||||
if (drop_pad_mode == 1) { // Drop/Pad
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = expert_num * expert_capacity;
|
||||
} else { // Dropless / Active
|
||||
if (active_num > 0) { // Active
|
||||
int64_t num_out_tokens = std::min((int64_t)bs * k, active_num);
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = num_out_tokens;
|
||||
} else { // Dropless
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({bs * k, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({bs * k, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = bs * k;
|
||||
}
|
||||
}
|
||||
|
||||
at::Tensor expanded_row_idx = at::empty({bs * k}, expert_idx.options());
|
||||
at::Tensor expert_tokens_count_or_cumsum;
|
||||
if (expert_tokens_num_type >= CUMSUM && expert_tokens_num_type <= COUNT) {
|
||||
// expert_tokens_count_or_cumsum in [end-start, ]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_length}, x.options().dtype(at::kLong));
|
||||
} else if (expert_tokens_num_type == KEY_VALUE) {
|
||||
// key_value in [2, end-start]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_num, 2}, x.options().dtype(at::kLong));
|
||||
}
|
||||
at::Tensor expanded_scale = at::empty({expanded_scale_len}, x.options().dtype(at::kFloat));
|
||||
EXEC_NPU_CMD(aclnnMoeInitRoutingCustom,
|
||||
x,
|
||||
expert_idx,
|
||||
scale,
|
||||
offset,
|
||||
active_num,
|
||||
expert_capacity,
|
||||
expert_num,
|
||||
drop_pad_mode,
|
||||
expert_tokens_num_type,
|
||||
expert_tokens_num_flag,
|
||||
quant_mode,
|
||||
active_expert_range,
|
||||
row_idx_type,
|
||||
expanded_x,
|
||||
expanded_row_idx,
|
||||
expert_tokens_count_or_cumsum,
|
||||
expanded_scale);
|
||||
return std::tie(expanded_x, expanded_row_idx, expert_tokens_count_or_cumsum, expanded_scale);
|
||||
}
|
||||
|
||||
} // namespace vllm_ascend
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
@@ -1257,4 +1357,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
"num_ranks) -> Tensor");
|
||||
ops.impl("combine_prefill", torch::kPrivateUse1,
|
||||
&vllm_ascend::combine_prefill);
|
||||
ops.def(
|
||||
"npu_moe_init_routing_custom(Tensor x, Tensor expert_idx, *, Tensor? scale=None, Tensor? offset=None, int active_num=-1, "
|
||||
" int expert_capacity=-1, int expert_num=-1, int drop_pad_mode=0, int expert_tokens_num_type=0, "
|
||||
" bool expert_tokens_num_flag=False, int quant_mode=0, int[2] active_expert_range=[], "
|
||||
" int row_idx_type=0) -> (Tensor, Tensor, Tensor, Tensor)"
|
||||
);
|
||||
ops.impl("npu_moe_init_routing_custom", torch::kPrivateUse1, &vllm_ascend::npu_moe_init_routing_custom);
|
||||
}
|
||||
|
||||
@@ -283,6 +283,89 @@ std::tuple<at::Tensor, at::Tensor> matmul_allreduce_add_rmsnorm_meta(
|
||||
return {output, add_out};
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_moe_init_routing_custom_meta(
|
||||
const at::Tensor &x, const at::Tensor &expert_idx,
|
||||
const c10::optional<at::Tensor> &scale, const c10::optional<at::Tensor> &offset, int64_t active_num,
|
||||
int64_t expert_capacity, int64_t expert_num, int64_t drop_pad_mode, int64_t expert_tokens_num_type,
|
||||
bool expert_tokens_num_flag, int64_t quant_mode, at::IntArrayRef active_expert_range, int64_t row_idx_type)
|
||||
{
|
||||
constexpr int64_t DIM_X = 2;
|
||||
constexpr int64_t DIM_EXPERT_IDX = 2;
|
||||
constexpr int64_t LENGTH_ACTIVE_EXPERT_RANGE = 2;
|
||||
constexpr int64_t EXPERT_TOKENS_COUNT = 1;
|
||||
constexpr int64_t EXPERT_TOKENS_KEY_VALUE = 2;
|
||||
constexpr int64_t QUANT_MODE_UNQUANT = -1;
|
||||
constexpr int64_t QUANT_MODE_DYNAMIC_QUANT = 1;
|
||||
constexpr int64_t CUMSUM = 0;
|
||||
constexpr int64_t COUNT = 1;
|
||||
constexpr int64_t KEY_VALUE = 2;
|
||||
|
||||
if (active_expert_range.empty()) {
|
||||
active_expert_range = at::IntArrayRef({0, expert_num});
|
||||
}
|
||||
|
||||
int64_t x_dim = x.dim();
|
||||
TORCH_CHECK(x_dim == DIM_X, "The x should be ", DIM_X,
|
||||
"-Dimension, current is ", x_dim, "-Dimension.");
|
||||
|
||||
int64_t expert_idx_dim = expert_idx.dim();
|
||||
TORCH_CHECK(expert_idx_dim == DIM_EXPERT_IDX, "The expert_idx should be ", DIM_EXPERT_IDX,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int64_t active_expert_range_length = active_expert_range.size();
|
||||
TORCH_CHECK(active_expert_range_length == LENGTH_ACTIVE_EXPERT_RANGE, "The active_expert_range should be ", LENGTH_ACTIVE_EXPERT_RANGE,
|
||||
"-Dimension, current is ", expert_idx_dim, "-Dimension.");
|
||||
|
||||
int expert_length = active_expert_range[1] - active_expert_range[0];
|
||||
auto x_size = x.sizes();
|
||||
auto expert_idx_size = expert_idx.sizes();
|
||||
|
||||
int bs = x_size[0];
|
||||
int h = x_size[1];
|
||||
int k = expert_idx_size[1];
|
||||
int64_t expanded_scale_len = 0;
|
||||
at::Tensor expanded_x;
|
||||
|
||||
if (drop_pad_mode == 1) { // Drop/Pad
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({expert_num, expert_capacity, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = expert_num * expert_capacity;
|
||||
} else { // Dropless / Active
|
||||
if (active_num > 0) { // Active
|
||||
int64_t num_out_tokens = std::min((int64_t)bs * k, active_num);
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({num_out_tokens, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = num_out_tokens;
|
||||
} else { // Dropless
|
||||
if (quant_mode == QUANT_MODE_UNQUANT) {
|
||||
expanded_x = at::empty({bs * k, h}, x.options());
|
||||
} else {
|
||||
expanded_x = at::empty({bs * k, h}, x.options().dtype(at::kChar));
|
||||
}
|
||||
expanded_scale_len = bs * k;
|
||||
}
|
||||
}
|
||||
|
||||
at::Tensor expanded_row_idx = at::empty({bs * k}, expert_idx.options());
|
||||
at::Tensor expert_tokens_count_or_cumsum;
|
||||
if (expert_tokens_num_type >= CUMSUM && expert_tokens_num_type <= COUNT) {
|
||||
// expert_tokens_count_or_cumsum in [end-start, ]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_length}, x.options().dtype(at::kLong));
|
||||
} else if (expert_tokens_num_type == KEY_VALUE) {
|
||||
// key_value in [2, end-start]
|
||||
expert_tokens_count_or_cumsum = at::empty({expert_num, 2}, x.options().dtype(at::kLong));
|
||||
}
|
||||
|
||||
at::Tensor expanded_scale = at::empty({expanded_scale_len}, x.options().dtype(at::kFloat));
|
||||
return {expanded_x, expanded_row_idx, expert_tokens_count_or_cumsum, expanded_scale};
|
||||
}
|
||||
|
||||
} // namespace meta
|
||||
} // namespace vllm_ascend
|
||||
|
||||
@@ -316,5 +399,7 @@ TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
|
||||
ops.impl("dispatch_ffn_combine", &vllm_ascend::meta::dispatch_ffn_combine_meta);
|
||||
// matmul allreduce add rmsnorm
|
||||
ops.impl("matmul_allreduce_add_rmsnorm", &vllm_ascend::meta::matmul_allreduce_add_rmsnorm_meta);
|
||||
// moe_init_routing_custom
|
||||
ops.impl("npu_moe_init_routing_custom", &vllm_ascend::meta::npu_moe_init_routing_custom_meta);
|
||||
}
|
||||
}
|
||||
|
||||
349
tests/e2e/nightly/ops/test_moe_init_routing_custom.py
Normal file
349
tests/e2e/nightly/ops/test_moe_init_routing_custom.py
Normal file
@@ -0,0 +1,349 @@
|
||||
import itertools
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm_ascend.utils import enable_custom_op
|
||||
|
||||
enable_custom_op()
|
||||
|
||||
|
||||
def adapter_capacity(sorted_row_idx, sorted_expert_idx, capacity):
    """Enforce per-expert capacity on already-sorted routing arrays, in place.

    Walks the expert ids (assumed grouped after a stable sort) and, once a run
    of identical ids exceeds `capacity`, marks the overflow entries in both
    `sorted_expert_idx` and `sorted_row_idx` with -1 so they are dropped.
    """
    run_len = 0
    current = sorted_expert_idx[0]
    for pos in range(len(sorted_expert_idx)):
        value = sorted_expert_idx[pos]
        if value == current:
            run_len += 1
        else:
            # A new expert run starts here; reset its slot counter.
            current = value
            run_len = 1
        if run_len > capacity:
            sorted_expert_idx[pos] = -1
            sorted_row_idx[pos] = -1
|
||||
|
||||
|
||||
def moe_init_routing_golden(x, expert_idx, scale, offset, active_num,
                            expert_capacity, expert_num, drop_pad_mode,
                            expert_tokens_num_type, expert_tokens_num_flag,
                            active_expert_range, quant_mode, row_idx_type):
    """Host-side (numpy) golden reference for aclnnMoeInitRoutingCustom.

    Returns a tuple (expanded_x, expanded_row_idx, expert_tokens_count,
    expanded_scale) computed on CPU for comparison against the NPU kernel.
    Individual elements may be None or placeholders depending on quant_mode
    and expert_tokens_num_flag.
    """
    if drop_pad_mode == 1:
        # Drop/Pad output is shaped (expert_num, capacity, h); needs expert_num > 0.
        if expert_num <= 0:
            print("expert num can not be 0")
            return
    # Drop/Pad mode treats every expert in [0, expert_num) as active.
    expert_start = active_expert_range[0] if drop_pad_mode == 0 else 0
    expert_end = active_expert_range[1] if drop_pad_mode == 0 else expert_num
    num_rows = x.shape[0]
    h = x.shape[1]
    k = expert_idx.shape[-1]
    expert_idx_in = expert_idx.copy().reshape(-1)
    # Token-expert pairs routed to an expert inside the active range.
    actual_expert_total_num: int = np.sum((expert_idx_in >= expert_start)
                                          & (expert_idx_in < expert_end))

    # Ids below expert_start are pushed to INT32_MAX so the stable sort places
    # them last; ids >= expert_end already sort after every in-range id, so the
    # first actual_expert_total_num sorted entries are exactly the valid pairs.
    expert_idx_in[(expert_idx_in
                   < expert_start)] = np.int32(np.iinfo(np.int32).max)
    sorted_expert_indices = np.argsort(expert_idx_in, axis=-1, kind="stable")
    sorted_expert_idx = expert_idx_in[sorted_expert_indices]
    if row_idx_type == 1:
        # Gather layout: output row i is sourced from pair expanded_row_idx[i].
        expanded_row_idx = sorted_expert_indices[:actual_expert_total_num]
    else:
        # Scatter layout: token-expert pair j writes output row
        # expanded_row_idx[j]; -1 marks dropped (out-of-range) pairs.
        expanded_row_idx = np.ones(num_rows * k).astype(np.int32) * -1
        tmp_indices = np.arange(actual_expert_total_num)
        expanded_row_idx[
            sorted_expert_indices[:actual_expert_total_num]] = tmp_indices

    if not expert_tokens_num_flag:
        # Placeholder; callers skip this output when the flag is off.
        expert_tokens_count = torch.tensor([0])
    else:
        if drop_pad_mode == 0:
            if expert_tokens_num_type == 1:
                # COUNT: tokens per expert over [expert_start, expert_end),
                # zero-padded for trailing experts that received no tokens.
                expert_tokens_count = np.bincount(
                    sorted_expert_idx[:actual_expert_total_num] - expert_start)
                expert_tokens_count = np.concatenate([
                    expert_tokens_count,
                    np.zeros((expert_end - expert_start) -
                             len(expert_tokens_count)).astype(np.int64)
                ])
            elif expert_tokens_num_type == 0:
                # CUMSUM: running total of the per-expert counts.
                expert_tokens_count = np.bincount(
                    sorted_expert_idx[:actual_expert_total_num] - expert_start)
                expert_tokens_count = np.concatenate([
                    expert_tokens_count,
                    np.zeros((expert_end - expert_start) -
                             len(expert_tokens_count)).astype(np.int64)
                ])
                expert_tokens_count = np.cumsum(expert_tokens_count)
            elif expert_tokens_num_type == 2:
                # KEY_VALUE: (expert id, count) pairs for experts that actually
                # received tokens, padded with a single [0, 0] row if short.
                expert_id, counts = np.unique(
                    sorted_expert_idx[:actual_expert_total_num],
                    return_counts=True)
                expert_tokens_count = np.column_stack((expert_id, counts))
                if expert_tokens_count.shape[0] < expert_num:
                    expert_tokens_count = np.concatenate(
                        (expert_tokens_count, [
                            [0, 0],
                        ]), axis=0)
        else:
            # Drop/Pad: plain per-expert counts, zero-padded.
            expert_tokens_count = np.bincount(
                sorted_expert_idx[:actual_expert_total_num] - expert_start)
            zeros_array = np.zeros(
                (expert_end - expert_start) - len(expert_tokens_count),
                dtype=np.int64)
            expert_tokens_count = np.concatenate(
                [expert_tokens_count, zeros_array])
        expert_tokens_count = expert_tokens_count.astype(np.int64)

    if drop_pad_mode == 0:
        # Dropless/Active: keep at most active_num routed pairs (0 = keep all).
        if active_num == 0:
            active_num = actual_expert_total_num
        else:
            active_num = min(active_num, actual_expert_total_num)
        expanded_scale = None
        # // k maps a flattened token-expert pair back to its source token row.
        expanded_x = x[sorted_expert_indices[:active_num] // k, :]
        if scale is not None and quant_mode == -1:
            expanded_scale = scale[sorted_expert_indices[:active_num] // k]
    else:
        # Drop/Pad: cap each expert at expert_capacity tokens (marks -1 in place).
        adapter_capacity(sorted_expert_indices, sorted_expert_idx,
                         expert_capacity)

        # sort_row_tmp[e * capacity + slot] = source pair index, -1 = padding.
        sort_row_tmp = np.full((expert_num * expert_capacity), -1, dtype=int)
        offset_tmp = 0
        lastExpertId = 0
        for i, val in enumerate(sorted_expert_indices):
            if val != -1:
                if lastExpertId != sorted_expert_idx[i]:
                    # New expert run begins: restart its slot counter.
                    offset_tmp = 0
                    lastExpertId = sorted_expert_idx[i]
                sort_row_tmp[sorted_expert_idx[i] * expert_capacity +
                             offset_tmp] = sorted_expert_indices[i]
                offset_tmp = offset_tmp + 1

        # Invert the mapping: source pair index -> output row (-1 = dropped).
        expanded_row_idx = np.full(sorted_expert_indices.shape, -1)
        for i, val in enumerate(sort_row_tmp):
            if val != -1:
                expanded_row_idx[val] = i

        # Mask is 1 for padded rows so they can be zeroed at the end.
        expanded_x_mask = np.full((expert_num * expert_capacity, h),
                                  1,
                                  dtype=int)
        expanded_x = np.full((expert_num * expert_capacity, h),
                             0,
                             dtype=x.dtype)
        for i, val in enumerate(sort_row_tmp):
            if val != -1:
                expanded_x[i] = x[val // k]
                expanded_x_mask[i] = np.full((h, ), 0, dtype=int)

    if quant_mode == -1:
        # Unquantized: data passes through; gather per-row scale if provided.
        expanded_x = expanded_x
        expanded_row_idx = expanded_row_idx
        if scale is not None and drop_pad_mode == 1:
            expanded_scale = np.full((expert_num * expert_capacity, ),
                                     0,
                                     dtype=scale.dtype)
            for i, val in enumerate(sort_row_tmp):
                if val != -1:
                    expanded_scale[i] = scale[val // k]
        if scale is None:
            expanded_scale = None

    if quant_mode == 0:
        # Static quant: int8 = clip(round(x * scale + offset), -128, 127),
        # computed in fp16 to mirror the device arithmetic.
        expanded_scale = None
        expanded_x_fp16 = expanded_x.astype(np.float16)
        if scale is not None:
            scale_val = scale.astype(np.float16)
        else:
            raise ValueError("scale cannot be None when quant_mode is 0")
        if offset is not None:
            offset_val = offset.astype(np.float16)
        else:
            raise ValueError("offset cannot be None when quant_mode is 0")
        scale_rst = expanded_x_fp16 * scale_val[0]
        add_offset = scale_rst + offset_val[0]
        round_data = np.rint(add_offset)
        round_data = np.clip(round_data, -128, 127)
        expanded_x = round_data.astype(np.int8)

    if quant_mode == 1:
        # Dynamic quant: optional smooth scale, then per-row abs-max to int8.
        x_final = expanded_x.astype(np.float32)
        if scale is None:
            x_abs = np.abs(x_final)
            x_max = np.max(x_abs, axis=-1, keepdims=True)
            expanded_scale = x_max / 127
            expanded_x = x_final / expanded_scale
            expanded_x = np.round(expanded_x).astype(np.int8)
        else:
            if scale.shape[0] == 1:
                # Single shared smooth-scale row.
                x_final = x_final * scale
            else:
                if drop_pad_mode == 0:
                    # Per-expert smooth scale, indexed by the row's expert id.
                    x_final = x_final * scale[sorted_expert_idx[:active_num] -
                                              expert_start]

                else:
                    # Drop/Pad rows are grouped by expert: i // capacity = expert.
                    for i, val in enumerate(sort_row_tmp):
                        if val != -1:
                            x_final[i] = x_final[i] * scale[i //
                                                            expert_capacity]
            x_abs = np.abs(x_final)
            x_max = np.max(x_abs, axis=-1, keepdims=True)
            expanded_scale = x_max / 127
            expanded_x = x_final / expanded_scale
            expanded_x = np.round(expanded_x).astype(np.int8)
        if x.dtype == np.int8:
            # Already-int8 input: no dynamic scale is produced.
            expanded_scale = None
    if drop_pad_mode == 1:
        # Zero out padded rows and reshape to (expert_num, capacity, h).
        expanded_x = np.ma.array(expanded_x, mask=expanded_x_mask).filled(0)
        expanded_x = expanded_x.reshape(expert_num, expert_capacity, h)

    return expanded_x, expanded_row_idx, expert_tokens_count, expanded_scale
|
||||
|
||||
|
||||
def npu_pta(x, expert_idx, scale, offset, active_num, expert_capacity,
            expert_num, drop_pad_mode, expert_tokens_num_type,
            expert_tokens_num_flag, quant_mode, active_expert_range,
            row_idx_type):
    """Invoke the registered custom NPU op and return its four outputs:
    (expanded_x, expanded_row_idx, expert_token_cumsum_or_count,
    expanded_scale)."""
    outputs = torch.ops._C_ascend.npu_moe_init_routing_custom(
        x,
        expert_idx,
        scale=scale,
        offset=offset,
        active_num=active_num,
        expert_capacity=expert_capacity,
        expert_num=expert_num,
        drop_pad_mode=drop_pad_mode,
        expert_tokens_num_type=expert_tokens_num_type,
        expert_tokens_num_flag=expert_tokens_num_flag,
        quant_mode=quant_mode,
        active_expert_range=active_expert_range,
        row_idx_type=row_idx_type)
    return outputs
|
||||
|
||||
|
||||
def cmp_out_golden(x_golden, x_out, dtype):
    """Elementwise-compare a device output tensor against a golden array.

    Only the first len(x_golden) rows of x_out are compared. int8 results get
    a loose absolute tolerance of 1 LSB; everything else uses tight float
    tolerances. Returns True when every element is within tolerance.
    """
    actual = x_out.cpu().numpy()[:len(x_golden)]
    if dtype == 'int8':
        tolerances = {'atol': 1}
    else:
        tolerances = {'rtol': 1e-05, 'atol': 1e-05}
    return np.all(np.isclose(actual, x_golden, **tolerances))
|
||||
|
||||
|
||||
def test_moe_npu(x, expert_idx, scale, offset, active_num, expert_capacity,
                 expert_num, drop_pad_mode, expert_tokens_num_type,
                 expert_tokens_num_flag, quant_mode, active_expert_range,
                 row_idx_type):
    """Run both the NPU op and the numpy golden model on identical inputs and
    report whether all four outputs agree within tolerance."""

    def to_npu(t):
        return t.npu() if t is not None else None

    def to_numpy(t):
        return t.numpy() if t is not None else None

    # Golden reference on host (note its argument order swaps
    # active_expert_range and quant_mode relative to the op wrapper).
    golden = moe_init_routing_golden(
        to_numpy(x), to_numpy(expert_idx), to_numpy(scale), to_numpy(offset),
        active_num, expert_capacity, expert_num, drop_pad_mode,
        expert_tokens_num_type, expert_tokens_num_flag, active_expert_range,
        quant_mode, row_idx_type)
    golden_x, golden_row_idx, golden_tokens, golden_scale = golden

    # Same inputs through the custom NPU operator.
    npu_x, npu_row_idx, npu_tokens, npu_scale = npu_pta(
        to_npu(x), to_npu(expert_idx), to_npu(scale), to_npu(offset),
        active_num, expert_capacity, expert_num, drop_pad_mode,
        expert_tokens_num_type, expert_tokens_num_flag, quant_mode,
        active_expert_range, row_idx_type)

    # Quantized paths emit int8 data, unquantized keeps float.
    x_dtype = "float32" if quant_mode == -1 else "int8"
    expanded_x_result = cmp_out_golden(golden_x, npu_x, x_dtype)

    expanded_row_idx_result = cmp_out_golden(golden_row_idx, npu_row_idx,
                                             "int32")

    if expert_tokens_num_flag:
        expert_tokens_result = cmp_out_golden(golden_tokens, npu_tokens,
                                              "int64")
    else:
        expert_tokens_result = True

    if quant_mode == 1 or (quant_mode == -1 and scale is not None):
        expand_scale_result = cmp_out_golden(golden_scale.flatten(),
                                             npu_scale, "float32")
    else:
        expand_scale_result = True

    compare_result = (expanded_x_result and expanded_row_idx_result
                      and expert_tokens_result and expand_scale_result)
    return compare_result
|
||||
|
||||
|
||||
def test_moe_init_routing_custom():
    """Randomized sweep over the aclnnMoeInitRoutingCustom attribute space.

    Iterates the Cartesian product of (drop_pad_mode, expert_tokens_num_type,
    expert_tokens_num_flag, quant_mode, row_idx_type, scale_type), draws
    random tensor shapes/contents for each combination, and compares the NPU
    result against the CPU golden via test_moe_npu(). Asserts that no
    combination mismatched, and reports exactly which combinations failed.
    """
    failed_cases = []  # (case index, attribute combo) for every mismatch

    # Attribute values to sweep.
    drop_pad_mode = [0, 1]
    expert_tokens_num_type = [0, 1, 2]
    expert_tokens_num_flag = [True, False]
    quant_mode = [0, 1, -1]
    row_idx_type = [0, 1]
    scale_type = [0, 1, 2]

    product_result = itertools.product(drop_pad_mode, expert_tokens_num_type,
                                       expert_tokens_num_flag, quant_mode,
                                       row_idx_type, scale_type)

    for idx, combo in enumerate(product_result):
        (drop_pad_mode_, expert_tokens_num_type_, expert_tokens_num_flag_,
         quant_mode_, row_idx_type_, scale_type_) = combo

        # Random expert space and an active sub-range [start, end) within it.
        expert_num_ = random.randint(2, 500)
        expert_start = random.randint(0, expert_num_ - 1)
        expert_end = random.randint(expert_start + 1, expert_num_)
        active_expert_range_ = [expert_start, expert_end]

        # Random token count, hidden size and top-k per token.
        N = random.randint(1, 100)
        H = random.randint(12, 100)
        K = random.randint(1, 12)
        x_ = torch.randn(N, H, dtype=torch.float16) * 5
        expert_capacity_ = random.randint(1, N - 1) if N > 1 else 1
        # NOTE(review): torch.randint's upper bound is exclusive, so expert
        # ids never reach expert_num_ - 1 — confirm this is intentional.
        expert_idx_ = torch.randint(0,
                                    expert_num_ - 1, (N, K),
                                    dtype=torch.int32)
        active_num_ = N * K

        # Drop/pad mode constrains the other attributes.
        if drop_pad_mode_ == 1:
            active_expert_range_ = [0, expert_num_]
            expert_tokens_num_type_ = 1
            row_idx_type_ = 0

        # Build scale/offset inputs per quantization mode:
        #    0 -> scalar scale + scalar offset
        #   -1 -> no quantization: neither tensor
        #    1 -> scale shape chosen by scale_type_, offset unused
        if quant_mode_ == 0:
            scale_ = torch.randn(1, dtype=torch.float)
            offset_ = torch.randn(1, dtype=torch.float)
        elif quant_mode_ == -1:
            scale_ = None
            offset_ = None
        else:
            if scale_type_ == 0:
                scale_ = None
            elif scale_type_ == 1:
                scale_ = torch.randn(1, H, dtype=torch.float)
            else:
                scale_ = torch.randn(active_expert_range_[1] -
                                     active_expert_range_[0],
                                     H,
                                     dtype=torch.float)
            offset_ = None  # offset is only used by static quant (mode 0)

        result_pta = test_moe_npu(x_, expert_idx_, scale_, offset_,
                                  active_num_, expert_capacity_, expert_num_,
                                  drop_pad_mode_, expert_tokens_num_type_,
                                  expert_tokens_num_flag_, quant_mode_,
                                  active_expert_range_, row_idx_type_)
        if not result_pta:
            failed_cases.append((idx, combo))

    # Report every failing attribute combination instead of a bare count.
    assert not failed_cases, (
        f"{len(failed_cases)} case(s) failed "
        f"(idx, (drop_pad_mode, tokens_num_type, tokens_num_flag, "
        f"quant_mode, row_idx_type, scale_type)): {failed_cases}")
|
||||