diff --git a/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h b/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h index b0063d66..15361263 100644 --- a/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h +++ b/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h @@ -22,19 +22,22 @@ extern "C" { #endif /** - * 算子功能:实现分布式MoE从InitRouting到Unpermute全部算子的融合 - * @brief aclnnDispatchFFNCombine的第一段接口,根据具体的计算流程,计算workspace大小。 + * Operator function: fuse all distributed MoE ops from InitRouting through Unpermute. + * @brief First-stage interface of aclnnDispatchFFNCombine that calculates the workspace size based on the specific compute flow. * @domain aclnn_ops_infer - * @param [in] a: matmul左矩阵,数据类型支持:float16, bf16。 - * @param [in] b: matmul右矩阵,数据类型支持:float16, bf16。 - * @param [in] bias: 偏置,数据类型支持:float16, bf16。 - * @param [in] group: 标识通信域名称的字符串。 - * @param [in] worldsize: 通信域size,支持2/4/8卡。 - * @param [in] epRankId: ep本卡Id。取值范围[0, worldSize),各卡的rankId不能重复 - * @param [out] c: 计算+通信的结果,数据类型:同输入。 - * @param [out] workspaceSize: 返回需要在npu device侧申请的workspace大小。 - * @param [out] executor: 返回op执行器,包含了算子计算流程。 - * @return aclnnStatus: 返回状态码 + * @param [in] x: The input tensor. + * @param [in] weight1: The first weight tensor. + * @param [in] weight2: The second weight tensor. + * @param [in] expertId: The expert ID tensor. + * @param [in] scale1: The first scale tensor. + * @param [in] scale2: The second scale tensor. + * @param [in] probs: The probabilities tensor. + * @param [in] group: String identifying the communication domain name. + * @param [in] maxOutputSize: The maximum output size. + * @param [out] out: Result of computation + communication; same dtype as input. + * @param [out] workspaceSize: Workspace size to allocate on the NPU device side. + * @param [out] executor: Op executor containing the operator compute flow. + * @return aclnnStatus: Status code. */ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const aclTensor* weight1, const aclTensor* weight2, const aclTensor* expertId, const aclTensor* scale1, const aclTensor* scale2, @@ -44,12 +47,12 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWor uint64_t* workspaceSize, aclOpExecutor** executor); /** - * @brief aclnnDispatchGmmCombine的第二段接口,用于执行计算。 - * @param [in] workspace: 在npu device侧申请的workspace内存起址。 - * @param [in] workspace_size: 在npu device侧申请的workspace大小,由第一段接口aclnnDispatchFFNCombineGetWorkspaceSize获取。 - * @param [in] exector: op执行器,包含了算子计算流程。 - * @param [in] stream: acl stream流。 - * @return aclnnStatus: 返回状态码 + * @brief Second-stage interface of aclnnDispatchFFNCombine that executes the computation. + * @param [in] workspace: Start address of the workspace memory allocated on the NPU device side. + * @param [in] workspaceSize: Workspace size allocated on the NPU device side, obtained from the first-stage interface aclnnDispatchFFNCombineGetWorkspaceSize. + * @param [in] executor: Op executor containing the operator compute flow. + * @param [in] stream: ACL stream. + * @return aclnnStatus: Status code.
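Review note: the doc comments above describe the standard aclnn two-stage calling convention (query the workspace, then launch). Below is a minimal host-side caller sketch, assuming the aclTensor handles were already created with aclCreateTensor; the parameter types after `scale2` (probs, group, maxOutputSize, out) are inferred from the doc comment because the declaration is truncated in this hunk, and `RunDispatchFFNCombine` is a hypothetical helper, not part of this patch.

```cpp
#include "acl/acl.h"
#include "aclnn_dispatch_ffn_combine.h"

// Hypothetical wrapper: drives the two-stage interface end to end.
aclnnStatus RunDispatchFFNCombine(const aclTensor* x, const aclTensor* weight1,
                                  const aclTensor* weight2, const aclTensor* expertId,
                                  const aclTensor* scale1, const aclTensor* scale2,
                                  const aclTensor* probs, const char* group,
                                  int64_t maxOutputSize, aclTensor* out, aclrtStream stream) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    // Stage 1: compute the required workspace size and build the executor.
    aclnnStatus ret = aclnnDispatchFFNCombineGetWorkspaceSize(
        x, weight1, weight2, expertId, scale1, scale2, probs, group,
        maxOutputSize, out, &workspaceSize, &executor);
    if (ret != ACL_SUCCESS) {
        return ret;
    }
    // Allocate device-side workspace only if the op asked for one.
    void* workspace = nullptr;
    if (workspaceSize > 0) {
        aclError mret = aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        if (mret != ACL_SUCCESS) {
            return static_cast<aclnnStatus>(mret);
        }
    }
    // Stage 2: launch on the stream, then wait so the workspace can be freed.
    ret = aclnnDispatchFFNCombine(workspace, workspaceSize, executor, stream);
    if (ret == ACL_SUCCESS) {
        ret = static_cast<aclnnStatus>(aclrtSynchronizeStream(stream));
    }
    if (workspace != nullptr) {
        aclrtFree(workspace);
    }
    return ret;
}
```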
*/ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombine(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor, aclrtStream stream); @@ -58,4 +61,4 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombine(void* } #endif -#endif // OP_API_INC_GMM_ALLTOALLV_ \ No newline at end of file +#endif // OP_API_INC_DISPATCH_FFN_COMBINE_ diff --git a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp index d487c453..649edf1f 100644 --- a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp +++ b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp @@ -56,7 +56,7 @@ class DispatchFFNCombine : public OpDef { .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); - // 输出 + // Output this->Output("out") .ParamType(REQUIRED) .DataType({ge::DT_FLOAT16, ge::DT_BF16, ge::DT_BF16}) diff --git a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp index a7f5f7ed..6342f1a1 100644 --- a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp +++ b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp @@ -27,7 +27,7 @@ using namespace AscendC; using namespace ge; namespace { - // 1. 常量定义 + // 1. Constant definitions const char *K_INNER_DEBUG = "DispatchFFNCombine Tiling Debug"; constexpr uint32_t ATTR_GROUP_INDEX = 0; constexpr uint32_t ATTR_MAX_OUTPUT_SIZE_INDEX = 1; @@ -54,13 +54,13 @@ static int32_t CeilDev(int32_t num, int32_t div) return (num + div - 1) / div; } -// 解析并校验 rankId, group, worldSize, isTransB 属性值 +// Parse and validate rankId, group, worldSize, and isTransB attributes static ge::graphStatus DispatchFFNCombineCheckAttrAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo& info) { auto attrs = context->GetAttrs(); OP_TILING_CHECK(attrs == nullptr, OP_LOGE(K_INNER_DEBUG, "attrs is null."), return ge::GRAPH_FAILED); - // todo:Attr相关tilingdata的设置、校验、打印 + // TODO: set, validate, and print tiling data related to attributes auto groupPtr = attrs->GetAttrPointer(static_cast(ATTR_GROUP_INDEX)); auto maxOutputSizePtr = attrs->GetAttrPointer(ATTR_MAX_OUTPUT_SIZE_INDEX); auto is_trans_b = attrs->GetAttrPointer(ATTR_IS_TRANS_B); @@ -87,7 +87,7 @@ static ge::graphStatus DispatchFFNCombineCheckAttrAndSetTiling(gert::TilingConte return ge::GRAPH_SUCCESS; } -// 提取输入张量 A 和 B 的形状,计算出 M、K、N 值 +// Extract shapes of input tensors A and B to compute M, K, N static ge::graphStatus DispatchFFNCombineCheckShapeAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo &info) { const char *nodeName = context->GetNodeName(); @@ -116,7 +116,7 @@ static ge::graphStatus DispatchFFNCombineCheckShapeAndSetTiling(gert::TilingCont return ge::GRAPH_SUCCESS; } -// 获取当前芯片平台的 AI Core 数目、UB 容量等硬件信息。 +// Get hardware info such as AI Core count and UB capacity for the current chip platform. 
static ge::graphStatus DispatchFFNCombineGetPlatformInfoAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo& info) { auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); @@ -146,9 +146,9 @@ void SetTilingData(CoCTiling &cocTilingData, DispatchFFNCombineInfo &info) cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 / 2; } -// 主调度函数: -// 获取 tilingData ➝ 检查 Attr ➝ 检查 Shape ➝ 获取平台信息 -// ➝ 调用 SetTilingData(根据rank数目) ➝ 设置 blockDim ➝ 设置 tilingKey ➝ 设置 workspace ➝ 配置通信参数 +// Main scheduling function: +// Get tilingData ➝ check Attr ➝ check Shape ➝ get platform info +// ➝ call SetTilingData (based on rank count) ➝ set blockDim ➝ set tilingKey ➝ set workspace ➝ configure communication parameters static ge::graphStatus DispatchFFNCombineTilingFuncImpl(gert::TilingContext *context) { @@ -262,4 +262,4 @@ ge::graphStatus TilingParseForDispatchFFNCombine(gert::TilingParseContext *conte IMPL_OP_OPTILING(DispatchFFNCombine) .Tiling(DispatchFFNCombineTilingFunc) .TilingParse(TilingParseForDispatchFFNCombine); -} // namespace optiling \ No newline at end of file +} // namespace optiling diff --git a/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h b/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h index 7bc4b835..827d4c5b 100644 --- a/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h +++ b/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h @@ -64,8 +64,8 @@ class HcomTopoInfo { ~HcomTopoInfo() = default; std::unordered_map rank_info_; std::mutex mutex_; - std::unordered_map group_to_ordered_stream_; // 通信域保序流 - std::unordered_map> device_id_to_group_to_ordered_stream_; // 通信域保序流 + std::unordered_map group_to_ordered_stream_; // Ordered stream for the communication domain + std::unordered_map> device_id_to_group_to_ordered_stream_; // Ordered stream for the communication domain }; } diff --git a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp index 311e2608..179cdc8e 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp @@ -8,8 +8,8 @@ * See LICENSE in the root of the software repository for the full text of the License. 
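Review note: the pipeline comment above is the key to reading this tiling file. A hedged sketch of how the staged checks chain together follows; the helper names are the real ones from this diff, but `TilingFuncSketch` and its body are illustrative, not the actual implementation.

```cpp
// Illustrative only: the staged checks chain with early exits, as the
// pipeline comment describes (within this file's existing includes).
static ge::graphStatus TilingFuncSketch(gert::TilingContext* context) {
    DispatchFFNCombineInfo info{};
    if (DispatchFFNCombineCheckAttrAndSetTiling(context, info) != ge::GRAPH_SUCCESS ||
        DispatchFFNCombineCheckShapeAndSetTiling(context, info) != ge::GRAPH_SUCCESS ||
        DispatchFFNCombineGetPlatformInfoAndSetTiling(context, info) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }
    // The real DispatchFFNCombineTilingFuncImpl then calls SetTilingData
    // (per rank count) and sets blockDim, tilingKey, workspace, and the
    // communication parameters before returning.
    return ge::GRAPH_SUCCESS;
}
```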
*/ -#ifndef DISPATH_FFN_COMBINE_KERNEL_HPP -#define DISPATH_FFN_COMBINE_KERNEL_HPP +#ifndef DISPATCH_FFN_COMBINE_KERNEL_HPP +#define DISPATCH_FFN_COMBINE_KERNEL_HPP #include "kernel_operator.h" @@ -324,7 +324,7 @@ private: int64_t gmGroupOffsetC = 0; uint32_t startCoreIdx = 0; uint32_t syncGroupIdx = 0; - AscendC::CrossCoreWaitFlag<0x2>(0); // 等待aiv计算cumsumformm + AscendC::CrossCoreWaitFlag<0x2>(0); // Wait for AIV to finish cumsum for matmul int64_t preCurrentmSum = 0; int32_t syncLoopIdx = -1; for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) { @@ -364,7 +364,7 @@ private: int64_t gmOffsetA = layoutA.GetOffset(offsetA); int64_t gmOffsetB = layoutB1.GetOffset(offsetB); int64_t gmOffsetC = layoutC.GetOffset(offsetC); - int64_t gmOffsetS = groupIdx * params.problemShape.n() + blockCoord.n() * L1TileShape::N; // 每个expert一组scale + int64_t gmOffsetS = groupIdx * params.problemShape.n() + blockCoord.n() * L1TileShape::N; // One scale group per expert if (currentM > 0) { blockMmad( gmA[gmGroupOffsetA + gmOffsetA], layoutA, @@ -465,7 +465,7 @@ private: int64_t gmOffsetA = layoutA.GetOffset(offsetA); int64_t gmOffsetB = layoutB2.GetOffset(offsetB); int64_t gmOffsetC = layoutC.GetOffset(offsetC); - int64_t gmOffsetS = groupIdx * n2 + blockCoord.n() * L1TileShape::N; // 每个expert一组scale + int64_t gmOffsetS = groupIdx * n2 + blockCoord.n() * L1TileShape::N; // One scale group per expert if (currentM > 0) { blockMmad( gmPermutedToken[gmGroupOffsetA + gmOffsetA], layoutA, @@ -537,7 +537,7 @@ private: void Dispatch(Params const ¶ms) { icache_preload(8); int64_t localTokenPerExpertOffset = peermemInfo.offsetPeerTokenPerExpert + tokenPerExpertLayout(params.rank, 0, 0) * sizeof(int32_t); - GM_ADDR localTokenPerExpert = shmem() + localTokenPerExpertOffset; // 把通信矩阵全部放到peermem + GM_ADDR localTokenPerExpert = shmem() + localTokenPerExpertOffset; // Place the entire communication matrix in peermem uint32_t expandedRowIdxOffset = AlignUp(params.problemShape.m(), 256) * params.topK * sizeof(int32_t); //---initRouting------ @@ -571,7 +571,7 @@ private: int32_t syncLoopIdx = -1; BlockEpilogue1 blockEpilogue(resource); for (int32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) { - // 第i个core从第i个rank的peermem读数据 + // The ith core reads data from the ith rank's peermem groupIdxDeq = groupIdx - 2; for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) { uint32_t rowStart = (dstEpIdx == 0 ? 
0 : cumsumMM((dstEpIdx - 1) * params.expertPerRank + groupIdx)) + prevGroupSum1; @@ -592,9 +592,9 @@ private: MatrixCoord offsetPeer{rowSrc, 0}; int64_t gmOffsetA = params.layoutA.GetOffset(offsetA); int64_t gmOffsetPeer = params.layoutA.GetOffset(offsetPeer); - // 通信Data + // Communication data CopyGMToGM(gmA[gmOffsetA], gmRemoteA[gmOffsetPeer], rows * params.problemShape.k(), params.ubMoveNum); - // 通信scale + // Communication scale CopyGMToGM(gmPerTokenScale1[rowStart], gmRemotePerTokenScale[rowSrc], rows, rows); } } @@ -604,7 +604,7 @@ private: AscendC::CrossCoreWaitFlag<0x2>(syncLoopIdx / 8 + 1); } AscendC::SyncAll(); - AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0); // V通知C当前轮的通信已完成 + AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0); // V notifies C that the current communication round is complete if ((params.epilogueGranularity < params.expertPerRank && params.epilogueGranularity > 0) && groupIdx == params.expertPerRank - 1 && prevGroupSum1 > 0) { uint32_t rowStartThisCore = 0; @@ -664,7 +664,7 @@ private: uint32_t n2 = params.problemShape.k(); uint32_t k2 = params.problemShape.n() / 2; - // TODO 计算tokenperexpert的cumsum + // TODO compute the cumsum of tokenPerExpert typename BlockEpilogue2::Params epilogueParams{ static_cast(params.EP), static_cast(params.expertPerRank), @@ -774,10 +774,10 @@ private: CATLASS_DEVICE PeermemInfo(const Params & params, const HcclShmem & shmem) { - offsetA = 0; // 占用1/3的BUFFSIZE - offsetPeerPerTokenScale = offsetA + AlignUp(shmem.SegmentSize() / 3, 512); // 占用1MB - offsetD = offsetPeerPerTokenScale + MB_SIZE; // 占用剩下的 - offsetPeerTokenPerExpert = shmem.SegmentSize() - 2 * MB_SIZE; // 占用最后2MB + offsetA = 0; // Occupies one third of BUFFSIZE + offsetPeerPerTokenScale = offsetA + AlignUp(shmem.SegmentSize() / 3, 512); // Occupies 1 MB + offsetD = offsetPeerPerTokenScale + MB_SIZE; // Occupies the remaining space + offsetPeerTokenPerExpert = shmem.SegmentSize() - 2 * MB_SIZE; // Occupies the final 2 MB } }; @@ -811,4 +811,4 @@ private: } // namespace Catlass::Gemm::Kernel -#endif // DISPATH_FFN_COMBINE_KERNEL_HPP \ No newline at end of file +#endif // DISPATCH_FFN_COMBINE_KERNEL_HPP diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp index 8453c810..811b2ce9 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp @@ -80,7 +80,7 @@ __aicore__ inline void moe_init_routing_quant_v2( sortPipe.Destroy(); } - if (tilingKey == 10000 || tilingKey == 10010 || tilingKey ==11000 || tilingKey ==11010) { //没有drop的情况 + if (tilingKey == 10000 || tilingKey == 10010 || tilingKey ==11000 || tilingKey ==11010) { // No drop scenario if (tilingData->expertTokensCountOrCumsumFlag != EXERPT_TOKENS_NONE) { TPipe expertTokenOutPipe; MoeV2ExpertTokenOut expertTokenOutOp; expertTokenOutOp.Init(expertTokensCountOrCumsum, expandedRowIdx, workspace, tilingData, &expertTokenOutPipe); expertTokenOutOp.Process(); expertTokenOutPipe.Destroy(); } TPipe srcToDstPipe; MoeV2SrcToDstOp srcToDstOp; srcToDstOp.Init(expandedRowIdx, workspace, tilingData, &srcToDstPipe); srcToDstOp.Process(); srcToDstPipe.Destroy(); - } else if (tilingKey ==10100 || tilingKey ==10110 || tilingKey ==11100 || tilingKey ==11110) { //有drop的情况 + } else if (tilingKey ==10100 || tilingKey ==10110 || tilingKey ==11100 || tilingKey ==11110) { // Drop scenario TPipe expertTokenOutPipe; MoeV2ExpertTokenOut expertTokenOutOp; expertTokenOutOp.Init(expertTokensCountOrCumsum,
expertTokensBeforeCapacity, diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h index 37130b17..02dee8f8 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h @@ -178,7 +178,7 @@ uint64_t InnerMoeInitRoutingV2TilingBase::GetTilingKey() const { return TILING_KEY_HIGH_PERFORMANCE; } if (dropPadMode == 0) { - if (totalLength <= sortLoopMaxElement) { // 排序只用到一个核排序 + if (totalLength <= sortLoopMaxElement) { // Sorting uses only one core return TILING_KEY_DROPLESS_SORT_ONE_CORE; } else { return TILING_KEY_DROPLESS_SORT_MULTI_CORE; @@ -206,10 +206,10 @@ bool InnerMoeInitRoutingV2TilingBase::GetShapeAttrsInfo(int64_t m, int64_t cols, this->expertTokensCountOrCumsumFlag = expertTokensCountOrCumsumFlag; this->expertTokensBeforeCapacityFlag = expertTokensBeforeCapacityFlag; if (dropPadMode == 1) { - // droppad场景下不输出expertTokensCountOrCumsum + // Do not output expertTokensCountOrCumsum in drop-pad mode expertTokensCountOrCumsumFlag = 0; } else { - // dropless场景下不输出expertTokensBeforeCapacity + // Do not output expertTokensBeforeCapacity in dropless mode expertTokensBeforeCapacityFlag = false; } moeInitRoutingTilingData.cols = cols; @@ -235,8 +235,8 @@ bool InnerMoeInitRoutingV2TilingBase::GetPlatformInfo(int64_t aivCoreNum, int64_ bool InnerMoeInitRoutingV2TilingBase::GetWorkspaceSize() { - // 计算workspace大小 - size_t sortWorkspaceSize = totalLength * sizeof(float) * NUM_TWO * NUM_THREE; // 排序需要的空间 + // Calculate workspace size + size_t sortWorkspaceSize = totalLength * sizeof(float) * NUM_TWO * NUM_THREE; // Space needed for sorting size_t scatterWorkspaceSize = totalLength * sizeof(int32_t) * NUM_TWO; size_t expertTokenFlagSize = aivNum * 2 * sizeof(int32_t); workspaceSize_ = sortWorkspaceSize + scatterWorkspaceSize + expertTokenFlagSize + SIZE_16 * LENGTH_1024 * LENGTH_1024; @@ -257,11 +257,11 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSOneCoreCompute(InnerMoeV2VBSComp void InnerMoeInitRoutingV2TilingBase::Tiling4VBSMultiCoreCompute(InnerMoeV2VBSComputeTilingData* tilingData) { //Tiling4VBSMultiCoreCompute - int64_t needCoreNum = CeilDiv(totalLength, sortLoopMaxElement); // 向上取整 + int64_t needCoreNum = CeilDiv(totalLength, sortLoopMaxElement); // Round up needCoreNum = static_cast(std::pow(4, CeilLog4(needCoreNum))); - needCoreNum = std::min(needCoreNum, aivNum); // 不能超过物理核数 + needCoreNum = std::min(needCoreNum, aivNum); // Cannot exceed physical core count if (needCoreNum > 0) { - int64_t perCoreElements = totalLength / needCoreNum; // 每个核处理的元素数 + int64_t perCoreElements = totalLength / needCoreNum; // Elements handled per core int64_t alineFloorPerCoreElements = perCoreElements - perCoreElements % SORT32_ALIGN_ELEMENT; int64_t lastCoreElement = totalLength - (needCoreNum - 1) * alineFloorPerCoreElements; int64_t alineCeilPerCoreElements = perCoreElements + SORT32_ALIGN_ELEMENT - perCoreElements % SORT32_ALIGN_ELEMENT; @@ -274,7 +274,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSMultiCoreCompute(InnerMoeV2VBSCo tilingData->needCoreNum = needCoreNum; do { tilingData->perCoreElements = perCoreElements; - tilingData->perCoreLoops = CeilDiv(tilingData->perCoreElements, sortLoopMaxElement); // 每个核处理的loop数 + tilingData->perCoreLoops = CeilDiv(tilingData->perCoreElements, sortLoopMaxElement); // Loops 
handled per core tilingData->perCorePerLoopElements = std::min(tilingData->perCoreElements, sortLoopMaxElement); tilingData->perCoreLastLoopElements = tilingData->perCoreElements - (tilingData->perCoreLoops - 1) * tilingData->perCorePerLoopElements; tilingData->lastCoreElements = totalLength - (tilingData->needCoreNum - 1) * tilingData->perCoreElements; @@ -294,7 +294,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSCompute() { auto tilingData = &moeInitRoutingTilingData.vbsComputeParamsOp; tilingData->oneLoopMaxElements = sortLoopMaxElement; - if (totalLength <= sortLoopMaxElement) { // 只用到一个核 + if (totalLength <= sortLoopMaxElement) { // Only one core is used Tiling4VBSOneCoreCompute(tilingData); return; } @@ -304,11 +304,11 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VMSMiddleCompute() { auto vbsComputeTilingData = &moeInitRoutingTilingData.vbsComputeParamsOp; auto tilingData = &moeInitRoutingTilingData.vmsMiddleComputeParamsOp; - if (vbsComputeTilingData->needCoreNum <= MRG_LIST_NUM) { // 队列数小于一次vms则没有中间归并 - tilingData->needCoreNum = 0; // 需要的核数 + if (vbsComputeTilingData->needCoreNum <= MRG_LIST_NUM) { // No intermediate merge if queue count fits one VMS + tilingData->needCoreNum = 0; // Required core count } else { int64_t needCoreNum = CeilDiv(vbsComputeTilingData->needCoreNum, MRG_LIST_NUM); - tilingData->needCoreNum = needCoreNum; // 需要的核数 + tilingData->needCoreNum = needCoreNum; // Required core count } } @@ -333,7 +333,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4SrcToDstCompute() { tilingData->needCoreNum = needCoreNum; int64_t lastCoreNum = totalLength - perCoreRows * (tilingData->needCoreNum - 1); tilingData->perCoreRows = perCoreRows; - if (perLoopMaxRows >= tilingData->perCoreRows) { // 一个loop结束 + if (perLoopMaxRows >= tilingData->perCoreRows) { // Fits in a single loop tilingData->perCorePerLoopRows = tilingData->perCoreRows; tilingData->perCoreLastLoopRows = tilingData->perCoreRows; } else { @@ -407,4 +407,4 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4SrcToDstCapacityCompute() { } } -} \ No newline at end of file +} diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h index fe057656..ac4c1f95 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h @@ -151,7 +151,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCumsum(bool isTai return; } #ifdef __CCE_KT_TEST__ - // CPU孪生调试无法使用多核同步,可能导致index为未初始化的脏数据,因此需要特殊处理 + // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially if (this->firstExpertId > expertTokensCountOrCumsumGm.GetSize()) { return; } @@ -202,7 +202,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCount(bool isTail int64_t copyLength = isTail ?
this->lastExpertId - this->firstExpertId + 1 : this->expertNumUbAlign; DataCopyExtParams copyParams{static_cast(1), static_cast(copyLength * sizeof(int32_t)), 0, 0, 0}; #ifdef __CCE_KT_TEST__ - // CPU孪生调试不进行输出拷贝 + // CPU twin debugging skips output copies return; #endif SetAtomicAdd(); @@ -307,4 +307,4 @@ __aicore__ inline void MoeV2ExpertTokenOut::Process() { } } // namespace MoeInitRoutingQuantV2 -#endif // INNER_MOE_V2_EXPERT_TOKEN_OUT_H \ No newline at end of file +#endif // INNER_MOE_V2_EXPERT_TOKEN_OUT_H diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h index 9f3ef220..924e8548 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h @@ -184,7 +184,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant::CopyOutXQuant1H(int64_t progr inputXInQueue.EnQue(inLocal); - // 计算quant + // Compute quantization Compute(smoothLocal); LocalTensor quantScaleLocal = scaleOutQueue.DeQue(); @@ -525,7 +525,7 @@ template __aicore__ inline void MoeV2GatherDynamicQuant::Process() { if (this->blockIdx < this->needCoreNum) { currentLoopRows = perLoopRows; - if (colLoops > 1) { // 一行无法全载,需要workspace + if (colLoops > 1) { // A single row cannot be fully loaded; workspace is required if (smoothType == 2) { for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) { CopyInExpandedExpertIdx(loop); @@ -543,7 +543,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant::Process() { CopyInExpandedRowIdx(this->rowLoops - 1); CopyOutPartialXQuant1H(this->rowLoops - 1); } - } else { // 一行可以全载 + } else { // A single row can be fully loaded if (smoothType == 2) { for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) { CopyInExpandedExpertIdx(loop); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h index bbdb1338..76427276 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h @@ -111,7 +111,7 @@ __aicore__ inline void MoeV2GatherOut::CopyOut(int64_t progress) { } outOffset = outIndex * cols + colsLoop * this->perLoopCols; #ifdef __CCE_KT_TEST__ - // CPU孪生调试无法使用多核同步,可能导致index为未初始化的脏数据,因此需要特殊处理 + // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially if (outOffset > expandedXGm.GetSize()) { continue; } diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h index 850e66b0..770163fc 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h @@ -132,7 +132,7 @@ __aicore__ inline void MoeV2SrcToDstWithCapacity::CopyOut(int64_t col = this->lastLoopCols; } #ifdef __CCE_KT_TEST__ - // CPU孪生调试无法使用多核同步,可能导致index为未初始化的脏数据,因此需要特殊处理 + // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially if (index * this->cols + i * this->perLoopCols + col * sizeof(T) > expandedXGm.GetSize()) { 
continue; } @@ -266,4 +266,4 @@ __aicore__ inline void MoeV2SrcToDstWithCapacity::Process() { this->SyncAll(); } } // namespace MoeInitRoutingQuantV2 -#endif // INNER_MOE_V2_SRC_TO_DST_WITH_CAPACITY_H \ No newline at end of file +#endif // INNER_MOE_V2_SRC_TO_DST_WITH_CAPACITY_H diff --git a/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h b/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h index 12b35a29..1255b5cf 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h +++ b/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h @@ -107,7 +107,7 @@ KernelMoeTokenUnpermute::Init(GM_ADDR permuted_tokens, GM_ADD this->tokens_splited_num = tiling_data->tokens_splited_num; this->tokens_splited_remain = tiling_data->tokens_splited_remain; - // 处理token_by_core尾块 + // Handle the tail block for token_by_core if (this->tokens_core_remain > 0 && blockIdx < this->tokens_core_remain) { this->tokens_core_length += 1; this->tokens_splited_remain += 1; @@ -181,7 +181,7 @@ __aicore__ inline void KernelMoeTokenUnpermute::Process() for (int64_t i = 0; i < this->tokens_splited_num; ++i) { CalMultiOutToken(i * this->tokens_splited_length, this->tokens_splited_length); } - // 处理tokens_num不能均匀分核数的尾块 + // Handle the tail block when tokens_num is not evenly divisible by core count if (this->tokens_splited_remain > 0) { CalMultiOutToken(this->tokens_splited_num * this->tokens_splited_length, this->tokens_splited_remain); } @@ -231,7 +231,7 @@ __aicore__ inline void KernelMoeTokenUnpermute::CalSingleOutT for (int64_t h_index = 0; h_index < this->hidden_splited_num; ++h_index) { CalPartOutToken(start_token, h_index, this->hidden_splited_length, out_token_idx); } - // 一次不能完整容纳完整的hidden_size, 处理尾块 + // Handle the tail block when a full hidden_size does not fit in one pass if (this->hidden_splited_remain > 0) { CalPartOutToken(start_token, this->hidden_splited_num, this->hidden_splited_remain, out_token_idx); } @@ -248,7 +248,7 @@ KernelMoeTokenUnpermute::CalPartOutToken(const int64_t start_ int64_t end_token = start_token + this->top_k; T2 cal_token_idx = this->indicesLocal.GetValue(start_token); - // 处理第一个Token数据 + // Handle the first token if (cal_token_idx < this->num_out_tokens) { float probsValue = 0; if constexpr (PROBS) { @@ -263,7 +263,7 @@ KernelMoeTokenUnpermute::CalPartOutToken(const int64_t start_ Duplicate(this->token_tensor0, static_cast(0), h_length); } - // 处理剩余的Token数据 + // Handle the remaining tokens for (int64_t token_index = start_token + 1; token_index < end_token; ++token_index) { cal_token_idx = this->indicesLocal.GetValue(token_index); if (cal_token_idx < this->num_out_tokens) { @@ -278,7 +278,7 @@ KernelMoeTokenUnpermute::CalPartOutToken(const int64_t start_ } } - // 输出计算结果 + // Write out the computed result CopyOut(out_token_index, h_index, h_length); } diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp index 8bdd017d..4b627a67 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp @@ -146,27 +146,27 @@ public: auto gmTileD = gmD[loopIdx * blockN]; LayoutC layoutUbC{1, blockN}; - // 把C从GM workspace搬到UB + // Move C from GM workspace to UB AscendC::WaitFlag(eventUbCVMTE2List[ubListId]); copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC); AscendC::SetFlag(eventUbCMTE2VList[ubListId]); - //在UB上做把C 
cast成FP32 + // Cast C to FP32 in UB AscendC::WaitFlag(eventUbCMTE2VList[ubListId]); AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN); AscendC::SetFlag(eventUbCVMTE2List[ubListId]); - // 获取pertoken scale值,gmPerTokenScale的第loopIdx行 + // Get per-token scale from row loopIdx of gmPerTokenScale ElementPerTokenScale perTokenScale = gmPerTokenScale(loopIdx); AscendC::SetFlag(0); AscendC::WaitFlag(0); - // pertoken scale值与FP32的C做Muls乘法 + // Multiply FP32 C by the per-token scale AscendC::PipeBarrier(); AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN); AscendC::PipeBarrier(); - // 将muls结果转回fp16/bf16 + // Cast the muls result back to fp16/bf16 LayoutD layoutUbD{1, blockN}; AscendC::WaitFlag(eventUbDMTE3VList[ubListId]); diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp index adca19f6..26630122 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp @@ -140,7 +140,7 @@ public: { params = params_; } - // 每个tile就是1*7168,每个block是一个expert的所有token=[group[i], 7168] + // Each tile is 1x7168, and each block covers all tokens for one expert = [group[i], 7168] CATLASS_DEVICE void operator() ( AscendC::GlobalTensor const &gmC, @@ -200,39 +200,39 @@ public: auto gmTileD = gmD[loopIdx * ChunkTileLen]; LayoutC layoutUbC{1, blockN}; - // 把C从GM workspace搬到UB + // Move C from GM workspace to UB AscendC::WaitFlag(eventUbCVMTE2List[ubListId]); copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC); AscendC::SetFlag(eventUbCMTE2VList[ubListId]); - // 在UB上做把C cast成FP32 + // Cast C to FP32 in UB AscendC::WaitFlag(eventUbCMTE2VList[ubListId]); AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN); AscendC::SetFlag(eventUbCVMTE2List[ubListId]); - // 获取pertoken scale值,gmPerTokenScale的第loopIdx行 + // Get per-token scale from row loopIdx of gmPerTokenScale ElementPerTokenScale perTokenScale = gmPerTokenScale1(loopIdx); AscendC::SetFlag(0); AscendC::WaitFlag(0); - // pertoken scale值与FP32的C做Muls乘法 + // Multiply FP32 C by the per-token scale AscendC::PipeBarrier(); AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN); AscendC::PipeBarrier(); - //swiglue计算过程 + // Swiglu computation process AscendC::Muls(ubCFp32ChunkN, ubCFp32, -1.0f, ChunkTileLen); AscendC::PipeBarrier(); AscendC::Exp(ubCFp32ChunkN, ubCFp32ChunkN, ChunkTileLen); AscendC::PipeBarrier(); AscendC::Adds(ubCFp32ChunkN, ubCFp32ChunkN, 1.0f, ChunkTileLen); AscendC::PipeBarrier(); - //TODO除的时候是否会对之后的数据有影响; + // TODO: confirm whether the division impacts subsequent data AscendC::Div(ubCFp32ChunkN, ubCFp32, ubCFp32ChunkN, ChunkTileLen); AscendC::PipeBarrier(); AscendC::Mul(ubCFp32ChunkN, ubCFp32ChunkN, ubCFp32[ChunkTileLen], ChunkTileLen); - //quant过程,两种方式区别; + // Quantization process; difference between the two approaches AscendC::PipeBarrier(); AscendC::Abs(ubAbs, ubCFp32ChunkN, ChunkTileLen); AscendC::PipeBarrier(); @@ -243,7 +243,7 @@ public: AscendC::SetFlag(0); AscendC::WaitFlag(0); - //TODO两种计算方法的效率比较 + // TODO: compare the efficiency of the two calculation methods ElementPerTokenScale GMubDequantScale = ubReduceMax.GetValue(0); AscendC::SetFlag(0); diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp index b66268fb..fd2b995c 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp +++ 
b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp @@ -56,7 +56,7 @@ FORCE_INLINE_AICORE int32_t gm_signal_wait_until_eq_for_barrier(__gm__ int32_t * constexpr int32_t MAX_RANK_SIZE = 32; class HcclShmem { public: - #ifdef HCCL_COMM // hccl需要初始化hccl context + #ifdef HCCL_COMM // HCCL needs to initialize the HCCL context __gm__ HcclOpResParamCustom *WinContext_{nullptr}; Hccl hccl_; GM_ADDR m_ptrArray[MAX_RANK_SIZE]; @@ -92,7 +92,7 @@ public: #endif FORCE_INLINE_AICORE - GM_ADDR operator() () const { // 无参数,返回本地peermem + GM_ADDR operator() () const { // No argument: return local peermem #ifdef HCCL_COMM return m_ptrArray[m_rank]; #else @@ -101,7 +101,7 @@ public: } FORCE_INLINE_AICORE - GM_ADDR operator() (int32_t index) const { // 带index参数,返回远端peermem首地址 + GM_ADDR operator() (int32_t index) const { // With index: return remote peermem base address #ifdef HCCL_COMM return m_ptrArray[index]; #else @@ -126,22 +126,6 @@ public: #endif } - // FORCE_INLINE_AICORE - // GM_ADDR operator () (GM_ADDR ptr, int32_t index) const { // shmem_ptr相同用法 - // #ifdef HCCL_COMM - // size_t offset = ptr - m_ptrArray[m_rank]; - // if (offset < 0 || offset >= m_segmentSize) { - // return nullptr; - // } - // if (index < 0 || index >= m_rankSize) { - // return nullptr; - // } - // return m_ptrArray[index] + offset; - // #else - // return shmem_ptr(ptr, index); - // #endif - // } - FORCE_INLINE_AICORE ~HcclShmem() { diff --git a/docs/source/installation.md b/docs/source/installation.md index 3f803413..3aa42944 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -157,6 +157,7 @@ cd .. # Install vLLM Ascend. git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git cd vllm-ascend +git submodule update --init --recursive pip install -v -e . cd .. ```
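Review note on the cross-core flags in dispatch_ffn_combine_kernel.hpp: the translated comments now make the protocol visible. AIV (vector) cores publish cumsum and per-round communication results, then raise a flag; AIC (cube) cores wait on that flag before each grouped-matmul round. A condensed sketch of the handshake, reusing the exact intrinsics from the diff (flag mode 0x2, flag ID 0); this is a reading aid, not code from the patch.

```cpp
#include "kernel_operator.h"

// AIV side (Dispatch): after this round's tokens and scales land in peermem,
// all vector cores sync, then raise flag 0 for the cube cores.
__aicore__ inline void AivPublishRound() {
    AscendC::SyncAll();
    AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0);  // "communication round complete"
}

// AIC side (grouped matmul): block until the vector cores have published.
__aicore__ inline void AicAwaitRound() {
    AscendC::CrossCoreWaitFlag<0x2>(0);  // pairs with the Set above
    // ...now safe to read the cumsum / permuted tokens from GM...
}
```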
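Review note on PeermemInfo: the constructor partitions one shmem segment as [A | per-token scale (1 MB) | D | token-per-expert (2 MB tail)], but nothing asserts the regions stay disjoint. A hedged host-side sanity check is sketched below; MB_SIZE = 1 MiB and the 512-byte AlignUp granularity are taken from the diff, while the check itself is my own illustration of the layout invariant.

```cpp
#include <cassert>
#include <cstdint>

constexpr uint64_t kMB = 1024ULL * 1024ULL;  // MB_SIZE in the kernel

uint64_t AlignUp(uint64_t v, uint64_t a) { return (v + a - 1) / a * a; }

// Mirrors PeermemInfo: [A | per-token scale (1 MB) | D ... | token-per-expert (2 MB)].
void CheckPeermemLayout(uint64_t segmentSize) {
    uint64_t offsetA = 0;
    uint64_t offsetScale = offsetA + AlignUp(segmentSize / 3, 512);
    uint64_t offsetD = offsetScale + kMB;
    uint64_t offsetTokenPerExpert = segmentSize - 2 * kMB;
    // The D region must at least start below the trailing 2 MB
    // communication matrix, or the regions overlap.
    assert(offsetD < offsetTokenPerExpert);
}
```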
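Review note on Tiling4VBSMultiCoreCompute in moe_init_routing_v2_tiling.h: the core-count math (ceil-divide, round up to a power of four, clamp to the physical AIV count, then align each core's chunk down to the sort granularity) is easy to misread, so here is a hedged standalone sketch with a worked example. SORT32_ALIGN_ELEMENT = 32 is an assumption based on the Sort32 naming, and CeilDiv/CeilLog4 are reimplemented here for illustration.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

constexpr int64_t kSortAlign = 32;  // assumed value of SORT32_ALIGN_ELEMENT

struct SortSplit { int64_t cores; int64_t perCoreAligned; int64_t lastCore; };

int64_t CeilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }
int64_t CeilLog4(int64_t n) {  // smallest e such that 4^e >= n
    int64_t e = 0;
    for (int64_t v = 1; v < n; v *= 4) { ++e; }
    return e;
}

// Mirrors the quoted logic: power-of-4 core count, clamped to the physical
// AIV count, with each core's chunk aligned down to the sort granularity.
SortSplit SplitForSort(int64_t totalLength, int64_t sortLoopMaxElement, int64_t aivNum) {
    int64_t cores = CeilDiv(totalLength, sortLoopMaxElement);
    cores = static_cast<int64_t>(std::pow(4, CeilLog4(cores)));
    cores = std::min(cores, aivNum);  // cannot exceed the physical core count
    int64_t perCore = totalLength / cores;
    int64_t perCoreAligned = perCore - perCore % kSortAlign;
    int64_t lastCore = totalLength - (cores - 1) * perCoreAligned;
    // Worked example: totalLength=100000, sortLoopMaxElement=8192, aivNum=40
    // -> CeilDiv gives 13, rounding to a power of 4 gives 16 cores,
    //    perCoreAligned = 6240, lastCore = 100000 - 15*6240 = 6400.
    return {cores, perCoreAligned, lastCore};
}
```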
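Review note on KernelMoeTokenUnpermute in moe_token_unpermute.h: once the tail-block handling is stripped away, each output token is a probs-weighted sum of its top-k permuted rows, with rows at or beyond num_out_tokens contributing zero (that semantic is inferred from the `cal_token_idx < this->num_out_tokens` guard). A scalar reference for one output token; all names in this sketch are mine.

```cpp
#include <cstdint>
#include <vector>

// out[h] = sum over k of probs[k] * permuted[indices[k]][h],
// where indices[k] >= numOutTokens means "dropped" and contributes zero.
std::vector<float> UnpermuteToken(const std::vector<float>& permuted,   // [numOut, hidden]
                                  const std::vector<int32_t>& indices,  // [topK] for this token
                                  const std::vector<float>& probs,      // [topK] for this token
                                  int64_t numOutTokens, int64_t hidden) {
    std::vector<float> out(hidden, 0.0f);
    for (size_t k = 0; k < indices.size(); ++k) {
        int32_t row = indices[k];
        if (row >= numOutTokens) {
            continue;  // dropped token: contributes zero
        }
        for (int64_t h = 0; h < hidden; ++h) {
            out[h] += probs[k] * permuted[row * hidden + h];
        }
    }
    return out;
}
```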
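Review note on block_epilogue_pertoken_swiglu.hpp: the Muls(-1) ➝ Exp ➝ Adds(1) ➝ Div ➝ Mul chain is SiLU on the first half of the row, gated by the second half, applied after the per-token dequant. A hedged scalar reference follows; dividing the abs-max by 127 for the int8 dynamic-quant scale is an assumption inferred from the Abs/ReduceMax pattern, not taken from the elided kernel code.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar view of the per-token epilogue: dequant -> swiglu -> dynamic int8 quant.
void SwigluEpilogueRef(const std::vector<float>& c,  // one matmul output row, length 2*n
                       float perTokenScale, int64_t n,
                       std::vector<int8_t>& yOut, float& dequantScaleOut) {
    std::vector<float> act(n);
    float absMax = 0.0f;
    for (int64_t i = 0; i < n; ++i) {
        float x1 = c[i] * perTokenScale;               // Muls by the per-token scale
        float gate = c[n + i] * perTokenScale;         // second half of the row
        float silu = x1 / (1.0f + std::exp(-x1));      // Muls(-1), Exp, Adds(1), Div
        act[i] = silu * gate;                          // Mul with ubCFp32[ChunkTileLen]
        absMax = std::max(absMax, std::fabs(act[i]));  // Abs + ReduceMax
    }
    dequantScaleOut = absMax > 0.0f ? absMax / 127.0f : 1.0f;  // assumed int8 scale
    yOut.resize(n);
    for (int64_t i = 0; i < n; ++i) {
        yOut[i] = static_cast<int8_t>(std::lround(act[i] / dequantScaleOut));
    }
}
```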