Update comment doc (#4731)

### What this PR does / why we need it? Translate remaining Chinese comments in the `dispatch_ffn_combine` code to English and update the installation guide to remind users to initialize submodules when building from source. - vLLM version: v0.12.0 - vLLM main: ad32e3e19c --------- Signed-off-by: mojave2 <chenchen145@huawei.com> Signed-off-by: Chen Chen <0109chenchen@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-05 15:07:31 +08:00
parent b32ef53b3b
commit 7f33838e6e
16 changed files with 100 additions and 112 deletions
--- a/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h
+++ b/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h
@@ -22,19 +22,22 @@ extern "C" {
 #endif

 /**
- * 算子功能：实现分布式MoE从InitRouting到Unpermute全部算子的融合
- * @brief aclnnDispatchFFNCombine的第一段接口，根据具体的计算流程，计算workspace大小。
+ * Operator function: fuse all distributed MoE ops from InitRouting through Unpermute.
+ * @brief First-stage interface of aclnnDispatchFFNCombine that calculates workspace size based on the specific compute flow.
 * @domain aclnn_ops_infer
- * @param [in] a: matmul左矩阵，数据类型支持：float16, bf16。
- * @param [in] b: matmul右矩阵，数据类型支持：float16, bf16。
- * @param [in] bias: 偏置，数据类型支持：float16, bf16。
- * @param [in] group: 标识通信域名称的字符串。
- * @param [in] worldsize: 通信域size，支持2/4/8卡。
- * @param [in] epRankId: ep本卡Id。取值范围[0, worldSize)，各卡的rankId不能重复
- * @param [out] c: 计算+通信的结果，数据类型：同输入。
- * @param [out] workspaceSize: 返回需要在npu device侧申请的workspace大小。
- * @param [out] executor: 返回op执行器，包含了算子计算流程。
- * @return aclnnStatus: 返回状态码
+ * @param [in] x: The input tensor.
+ * @param [in] weight1: The first weight tensor.
+ * @param [in] weight2: The second weight tensor.
+ * @param [in] expertId: The expert ID tensor.
+ * @param [in] scale1: The first scale tensor.
+ * @param [in] scale2: The second scale tensor.
+ * @param [in] probs: The probabilities tensor.
+ * @param [in] group: string identifying the communication domain name.
+ * @param [in] maxOutputSize: The maximum output size.
+ * @param [out] out: result of computation + communication; same dtype as input.
+ * @param [out] workspaceSize: workspace size to allocate on the NPU device side.
+ * @param [out] executor: op executor containing the operator compute flow.
+ * @return aclnnStatus: status code.
 */
 __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const aclTensor* weight1, const aclTensor* weight2,
                                                                                        const aclTensor* expertId, const aclTensor* scale1, const aclTensor* scale2,
@@ -44,12 +47,12 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWor
                                                                                        uint64_t* workspaceSize, aclOpExecutor** executor);

 /**
- * @brief aclnnDispatchGmmCombine的第二段接口，用于执行计算。
- * @param [in] workspace: 在npu device侧申请的workspace内存起址。
- * @param [in] workspace_size: 在npu device侧申请的workspace大小，由第一段接口aclnnDispatchFFNCombineGetWorkspaceSize获取。
- * @param [in] exector: op执行器，包含了算子计算流程。
- * @param [in] stream: acl stream流。
- * @return aclnnStatus: 返回状态码
+ * @brief Second-stage interface of aclnnDispatchFFNCombine to execute computation.
+ * @param [in] workspace: workspace memory address allocated on the NPU device side.
+ * @param [in] workspace_size: workspace size allocated on the NPU device side, obtained from aclnnDispatchFFNCombineGetWorkspaceSize.
+ * @param [in] executor: op executor containing the operator compute flow.
+ * @param [in] stream: acl stream.
+ * @return aclnnStatus: status code.
 */
 __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombine(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor,
                                           aclrtStream stream);
@@ -58,4 +61,4 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombine(void*
 }
 #endif

-#endif  // OP_API_INC_GMM_ALLTOALLV_
+#endif  // OP_API_INC_DISPATCH_FFN_COMBINE_
--- a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp
+++ b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp
@@ -56,7 +56,7 @@ class DispatchFFNCombine : public OpDef {
        .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
        .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});

-    // 输出
+    // Output
    this->Output("out")
        .ParamType(REQUIRED)
        .DataType({ge::DT_FLOAT16, ge::DT_BF16, ge::DT_BF16})
--- a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp
+++ b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp
@@ -27,7 +27,7 @@ using namespace AscendC;
 using namespace ge;

 namespace {
-    // 1. 常量定义
+    // 1. Constant definitions
    const char *K_INNER_DEBUG = "DispatchFFNCombine Tiling Debug";
    constexpr uint32_t ATTR_GROUP_INDEX = 0;
    constexpr uint32_t ATTR_MAX_OUTPUT_SIZE_INDEX = 1;
@@ -54,13 +54,13 @@ static int32_t CeilDev(int32_t num, int32_t div)
    return (num + div - 1) / div;
 }

-// 解析并校验 rankId, group, worldSize, isTransB 属性值
+// Parse and validate rankId, group, worldSize, and isTransB attributes
 static ge::graphStatus DispatchFFNCombineCheckAttrAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo& info)
 {
    auto attrs = context->GetAttrs();
    OP_TILING_CHECK(attrs == nullptr, OP_LOGE(K_INNER_DEBUG, "attrs is null."), return ge::GRAPH_FAILED);

-    // todo：Attr相关tilingdata的设置、校验、打印
+    // TODO: set, validate, and print tiling data related to attributes
    auto groupPtr = attrs->GetAttrPointer<char>(static_cast<int>(ATTR_GROUP_INDEX));
    auto maxOutputSizePtr = attrs->GetAttrPointer<int>(ATTR_MAX_OUTPUT_SIZE_INDEX);
    auto is_trans_b = attrs->GetAttrPointer<bool>(ATTR_IS_TRANS_B);
@@ -87,7 +87,7 @@ static ge::graphStatus DispatchFFNCombineCheckAttrAndSetTiling(gert::TilingConte
    return ge::GRAPH_SUCCESS;
 }

-// 提取输入张量 A 和 B 的形状，计算出 M、K、N 值
+// Extract shapes of input tensors A and B to compute M, K, N
 static ge::graphStatus DispatchFFNCombineCheckShapeAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo &info)
 {
    const char *nodeName = context->GetNodeName();
@@ -116,7 +116,7 @@ static ge::graphStatus DispatchFFNCombineCheckShapeAndSetTiling(gert::TilingCont
    return ge::GRAPH_SUCCESS;
 }

-// 获取当前芯片平台的 AI Core 数目、UB 容量等硬件信息。
+// Get hardware info such as AI Core count and UB capacity for the current chip platform.
 static ge::graphStatus DispatchFFNCombineGetPlatformInfoAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo& info)
 {
    auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
@@ -146,9 +146,9 @@ void SetTilingData(CoCTiling &cocTilingData, DispatchFFNCombineInfo &info)
    cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 / 2;
 }

-// 主调度函数：
-// 获取 tilingData ➝ 检查 Attr ➝ 检查 Shape ➝ 获取平台信息 
-// ➝ 调用 SetTilingData（根据rank数目） ➝ 设置 blockDim ➝ 设置 tilingKey ➝ 设置 workspace ➝ 配置通信参数
+// Main scheduling function:
+// Get tilingData ➝ check Attr ➝ check Shape ➝ get platform info 
+// ➝ call SetTilingData (based on rank count) ➝ set blockDim ➝ set tilingKey ➝ set workspace ➝ configure communication parameters

 static ge::graphStatus DispatchFFNCombineTilingFuncImpl(gert::TilingContext *context)
 {
@@ -262,4 +262,4 @@ ge::graphStatus TilingParseForDispatchFFNCombine(gert::TilingParseContext *conte
 IMPL_OP_OPTILING(DispatchFFNCombine)
    .Tiling(DispatchFFNCombineTilingFunc)
    .TilingParse<DispatchFFNCombineCompileInfo>(TilingParseForDispatchFFNCombine);
-} // namespace optiling
+} // namespace optiling
--- a/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h
+++ b/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h
@@ -64,8 +64,8 @@ class HcomTopoInfo {
  ~HcomTopoInfo() = default;
  std::unordered_map<std::string, TopoInfo> rank_info_;
  std::mutex mutex_;
-  std::unordered_map<std::string, void*> group_to_ordered_stream_; // 通信域保序流
-  std::unordered_map<int32_t, std::unordered_map<std::string, void*>> device_id_to_group_to_ordered_stream_; // 通信域保序流
+  std::unordered_map<std::string, void*> group_to_ordered_stream_; // Ordered stream for the communication domain
+  std::unordered_map<int32_t, std::unordered_map<std::string, void*>> device_id_to_group_to_ordered_stream_; // Ordered stream for the communication domain
 };
 }

--- a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp
@@ -8,8 +8,8 @@
 * See LICENSE in the root of the software repository for the full text of the License.
 */

-#ifndef DISPATH_FFN_COMBINE_KERNEL_HPP
-#define DISPATH_FFN_COMBINE_KERNEL_HPP
+#ifndef DISPATCH_FFN_COMBINE_KERNEL_HPP
+#define DISPATCH_FFN_COMBINE_KERNEL_HPP

 #include "kernel_operator.h"

@@ -324,7 +324,7 @@ private:
        int64_t gmGroupOffsetC = 0;
        uint32_t startCoreIdx = 0;
        uint32_t syncGroupIdx = 0;
-        AscendC::CrossCoreWaitFlag<0x2>(0); // 等待aiv计算cumsumformm
+        AscendC::CrossCoreWaitFlag<0x2>(0); // Wait for AIV to finish cumsum for matmul
        int64_t preCurrentmSum = 0;
        int32_t syncLoopIdx = -1;
        for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) {
@@ -364,7 +364,7 @@ private:
                int64_t gmOffsetA = layoutA.GetOffset(offsetA);
                int64_t gmOffsetB = layoutB1.GetOffset(offsetB);
                int64_t gmOffsetC = layoutC.GetOffset(offsetC);
-                int64_t gmOffsetS = groupIdx * params.problemShape.n() + blockCoord.n() * L1TileShape::N;   // 每个expert一组scale
+                int64_t gmOffsetS = groupIdx * params.problemShape.n() + blockCoord.n() * L1TileShape::N;   // One scale group per expert
                if (currentM > 0) {
                    blockMmad(
                        gmA[gmGroupOffsetA + gmOffsetA], layoutA,
@@ -465,7 +465,7 @@ private:
                int64_t gmOffsetA = layoutA.GetOffset(offsetA);
                int64_t gmOffsetB = layoutB2.GetOffset(offsetB);
                int64_t gmOffsetC = layoutC.GetOffset(offsetC);
-                int64_t gmOffsetS = groupIdx * n2 + blockCoord.n() * L1TileShape::N;   // 每个expert一组scale
+                int64_t gmOffsetS = groupIdx * n2 + blockCoord.n() * L1TileShape::N;   // One scale group per expert
                if (currentM > 0) {
                    blockMmad(
                        gmPermutedToken[gmGroupOffsetA + gmOffsetA], layoutA,
@@ -537,7 +537,7 @@ private:
    void Dispatch(Params const &params) {
        icache_preload(8);
        int64_t localTokenPerExpertOffset = peermemInfo.offsetPeerTokenPerExpert + tokenPerExpertLayout(params.rank, 0, 0) * sizeof(int32_t);
-        GM_ADDR localTokenPerExpert = shmem() + localTokenPerExpertOffset;     // 把通信矩阵全部放到peermem
+        GM_ADDR localTokenPerExpert = shmem() + localTokenPerExpertOffset;     // Place the entire communication matrix in peermem
        uint32_t expandedRowIdxOffset = AlignUp(params.problemShape.m(), 256) * params.topK * sizeof(int32_t);

        //---initRouting------
@@ -571,7 +571,7 @@ private:
        int32_t syncLoopIdx = -1;
        BlockEpilogue1 blockEpilogue(resource);
        for (int32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) {
-            // 第i个core从第i个rank的peermem读数据
+            // The ith core reads data from the ith rank's peermem
            groupIdxDeq = groupIdx - 2;
            for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) {
                uint32_t rowStart = (dstEpIdx == 0 ? 0 : cumsumMM((dstEpIdx - 1) * params.expertPerRank + groupIdx)) + prevGroupSum1;
@@ -592,9 +592,9 @@ private:
                    MatrixCoord offsetPeer{rowSrc, 0};
                    int64_t gmOffsetA = params.layoutA.GetOffset(offsetA);
                    int64_t gmOffsetPeer = params.layoutA.GetOffset(offsetPeer);
-                    // 通信Data
+                    // Communication data
                    CopyGMToGM(gmA[gmOffsetA], gmRemoteA[gmOffsetPeer], rows * params.problemShape.k(), params.ubMoveNum);
-                    // 通信scale
+                    // Communication scale
                    CopyGMToGM(gmPerTokenScale1[rowStart], gmRemotePerTokenScale[rowSrc], rows, rows);
                }
            }
@@ -604,7 +604,7 @@ private:
                AscendC::CrossCoreWaitFlag<0x2>(syncLoopIdx / 8 + 1);
            }
            AscendC::SyncAll<true>();
-            AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0);   // V通知C当前轮的通信已完成
+            AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0);   // V notifies C that the current communication round is complete
            
            if ((params.epilogueGranularity < params.expertPerRank && params.epilogueGranularity > 0) && groupIdx == params.expertPerRank - 1 && prevGroupSum1 > 0) {
                uint32_t rowStartThisCore = 0;
@@ -664,7 +664,7 @@ private:
        uint32_t n2 = params.problemShape.k();
        uint32_t k2 = params.problemShape.n() / 2;

-        // TODO 计算tokenperexpert的cumsum
+        // TODO compute the cumsum of tokenPerExpert
        typename BlockEpilogue2::Params epilogueParams{
            static_cast<int32_t>(params.EP),
            static_cast<int32_t>(params.expertPerRank),
@@ -774,10 +774,10 @@ private:

        CATLASS_DEVICE
        PeermemInfo(const Params & params, const HcclShmem & shmem) {
-            offsetA = 0;    // 占用1/3的BUFFSIZE
-            offsetPeerPerTokenScale = offsetA + AlignUp(shmem.SegmentSize() / 3, 512); // 占用1MB
-            offsetD = offsetPeerPerTokenScale + MB_SIZE;    // 占用剩下的
-            offsetPeerTokenPerExpert = shmem.SegmentSize() - 2 * MB_SIZE;     // 占用最后2MB
+            offsetA = 0;    // Occupies one third of BUFFSIZE
+            offsetPeerPerTokenScale = offsetA + AlignUp(shmem.SegmentSize() / 3, 512); // Occupies 1 MB
+            offsetD = offsetPeerPerTokenScale + MB_SIZE;    // Occupies the remaining space
+            offsetPeerTokenPerExpert = shmem.SegmentSize() - 2 * MB_SIZE;     // Occupies the final 2 MB
        }
    };

@@ -811,4 +811,4 @@ private:

 } // namespace Catlass::Gemm::Kernel

-#endif // DISPATH_FFN_COMBINE_KERNEL_HPP
+#endif // DISPATH_FFN_COMBINE_KERNEL_HPP
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp
@@ -80,7 +80,7 @@ __aicore__ inline  void moe_init_routing_quant_v2(
    sortPipe.Destroy();
  }

-  if (tilingKey == 10000 || tilingKey == 10010 || tilingKey ==11000 || tilingKey ==11010) { //没有drop的情况
+  if (tilingKey == 10000 || tilingKey == 10010 || tilingKey ==11000 || tilingKey ==11010) { // No drop scenario
    if (tilingData->expertTokensCountOrCumsumFlag != EXERPT_TOKENS_NONE) {
      TPipe expertTokenOutPipe;
      MoeV2ExpertTokenOut expertTokenOutOp;
@@ -94,7 +94,7 @@ __aicore__ inline  void moe_init_routing_quant_v2(
    srcToDstOp.Init<MoeInitRoutingQuantV2TilingData>(expandedRowIdx, workspace, tilingData, &srcToDstPipe);
    srcToDstOp.Process();
    srcToDstPipe.Destroy();
-  } else if (tilingKey ==10100 || tilingKey ==10110 || tilingKey ==11100 || tilingKey ==11110) { //有drop的情况
+  } else if (tilingKey ==10100 || tilingKey ==10110 || tilingKey ==11100 || tilingKey ==11110) { // Drop scenario
    TPipe expertTokenOutPipe;
    MoeV2ExpertTokenOut expertTokenOutOp;
    expertTokenOutOp.Init<MoeInitRoutingQuantV2TilingData>(expertTokensCountOrCumsum, expertTokensBeforeCapacity,
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h
@@ -178,7 +178,7 @@ uint64_t InnerMoeInitRoutingV2TilingBase::GetTilingKey() const {
    return TILING_KEY_HIGH_PERFORMANCE;
  }
  if (dropPadMode == 0) {
-    if (totalLength <= sortLoopMaxElement) {  // 排序只用到一个核排序
+    if (totalLength <= sortLoopMaxElement) {  // Sorting uses only one core
      return TILING_KEY_DROPLESS_SORT_ONE_CORE;
    } else {
      return TILING_KEY_DROPLESS_SORT_MULTI_CORE;
@@ -206,10 +206,10 @@ bool InnerMoeInitRoutingV2TilingBase::GetShapeAttrsInfo(int64_t m, int64_t cols,
  this->expertTokensCountOrCumsumFlag = expertTokensCountOrCumsumFlag;
  this->expertTokensBeforeCapacityFlag = expertTokensBeforeCapacityFlag;
  if (dropPadMode == 1) {
-    // droppad场景下不输出expertTokensCountOrCumsum
+    // Do not output expertTokensCountOrCumsum in drop-pad mode
    expertTokensCountOrCumsumFlag = 0;
  } else {
-    // dropless场景下不输出expertTokensBeforeCapacity
+    // Do not output expertTokensBeforeCapacity in dropless mode
    expertTokensBeforeCapacityFlag = false;
  }
  moeInitRoutingTilingData.cols = cols;
@@ -235,8 +235,8 @@ bool InnerMoeInitRoutingV2TilingBase::GetPlatformInfo(int64_t aivCoreNum, int64_


 bool InnerMoeInitRoutingV2TilingBase::GetWorkspaceSize() {
-  // 计算workspace大小
-  size_t sortWorkspaceSize = totalLength * sizeof(float) * NUM_TWO * NUM_THREE;  // 排序需要的空间
+  // Calculate workspace size
+  size_t sortWorkspaceSize = totalLength * sizeof(float) * NUM_TWO * NUM_THREE;  // Space needed for sorting
  size_t scatterWorkspaceSize = totalLength * sizeof(int32_t) * NUM_TWO;
  size_t expertTokenFlagSize = aivNum * 2 * sizeof(int32_t);
  workspaceSize_ = sortWorkspaceSize + scatterWorkspaceSize + expertTokenFlagSize + SIZE_16 * LENGTH_1024 * LENGTH_1024;
@@ -257,11 +257,11 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSOneCoreCompute(InnerMoeV2VBSComp

 void InnerMoeInitRoutingV2TilingBase::Tiling4VBSMultiCoreCompute(InnerMoeV2VBSComputeTilingData* tilingData) {
      //Tiling4VBSMultiCoreCompute
-      int64_t needCoreNum = CeilDiv(totalLength, sortLoopMaxElement);  // 向上取整
+      int64_t needCoreNum = CeilDiv(totalLength, sortLoopMaxElement);  // Round up
      needCoreNum = static_cast<int64_t>(std::pow(4, CeilLog4(needCoreNum)));
-      needCoreNum = std::min(needCoreNum, aivNum);                     // 不能超过物理核数
+      needCoreNum = std::min(needCoreNum, aivNum);                     // Cannot exceed physical core count
      if (needCoreNum > 0) {
-          int64_t perCoreElements = totalLength / needCoreNum;  // 每个核处理的元素数
+          int64_t perCoreElements = totalLength / needCoreNum;  // Elements handled per core
          int64_t alineFloorPerCoreElements = perCoreElements - perCoreElements % SORT32_ALIGN_ELEMENT;
          int64_t lastCoreElement = totalLength - (needCoreNum - 1) * alineFloorPerCoreElements;
          int64_t alineCeilPerCoreElements = perCoreElements + SORT32_ALIGN_ELEMENT - perCoreElements % SORT32_ALIGN_ELEMENT;
@@ -274,7 +274,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSMultiCoreCompute(InnerMoeV2VBSCo
          tilingData->needCoreNum = needCoreNum;
          do {
              tilingData->perCoreElements = perCoreElements;
-              tilingData->perCoreLoops = CeilDiv(tilingData->perCoreElements, sortLoopMaxElement);  // 每个核处理的loop数
+              tilingData->perCoreLoops = CeilDiv(tilingData->perCoreElements, sortLoopMaxElement);  // Loops handled per core
              tilingData->perCorePerLoopElements = std::min(tilingData->perCoreElements, sortLoopMaxElement);
              tilingData->perCoreLastLoopElements = tilingData->perCoreElements - (tilingData->perCoreLoops - 1) * tilingData->perCorePerLoopElements;
              tilingData->lastCoreElements = totalLength - (tilingData->needCoreNum - 1) * tilingData->perCoreElements;
@@ -294,7 +294,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSMultiCoreCompute(InnerMoeV2VBSCo
 void InnerMoeInitRoutingV2TilingBase::Tiling4VBSCompute() {
  auto tilingData = &moeInitRoutingTilingData.vbsComputeParamsOp;
  tilingData->oneLoopMaxElements = sortLoopMaxElement;
-  if (totalLength <= sortLoopMaxElement) {  // 只用到一个核
+  if (totalLength <= sortLoopMaxElement) {  // Only one core is used
    Tiling4VBSOneCoreCompute(tilingData);
    return;
  }
@@ -304,11 +304,11 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSCompute() {
 void InnerMoeInitRoutingV2TilingBase::Tiling4VMSMiddleCompute() {
  auto vbsComputeTilingData = &moeInitRoutingTilingData.vbsComputeParamsOp;
  auto tilingData = &moeInitRoutingTilingData.vmsMiddleComputeParamsOp;
-  if (vbsComputeTilingData->needCoreNum <= MRG_LIST_NUM) {  // 队列数小于一次vms则没有中间归并
-      tilingData->needCoreNum = 0;                               // 需要的核数
+  if (vbsComputeTilingData->needCoreNum <= MRG_LIST_NUM) {  // No intermediate merge if queue count fits one VMS
+      tilingData->needCoreNum = 0;                               // Required core count
  } else {
      int64_t needCoreNum = CeilDiv(vbsComputeTilingData->needCoreNum, MRG_LIST_NUM);
-      tilingData->needCoreNum = needCoreNum;  // 需要的核数
+      tilingData->needCoreNum = needCoreNum;  // Required core count
  }
 }

@@ -333,7 +333,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4SrcToDstCompute() {
  tilingData->needCoreNum = needCoreNum;
  int64_t lastCoreNum = totalLength - perCoreRows * (tilingData->needCoreNum - 1);
  tilingData->perCoreRows = perCoreRows;
-  if (perLoopMaxRows >= tilingData->perCoreRows) {  // 一个loop结束
+  if (perLoopMaxRows >= tilingData->perCoreRows) {  // One loop completes
      tilingData->perCorePerLoopRows = tilingData->perCoreRows;
      tilingData->perCoreLastLoopRows = tilingData->perCoreRows;
  } else {
@@ -407,4 +407,4 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4SrcToDstCapacityCompute() {
  }
 }

-}
+}
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h
@@ -151,7 +151,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCumsum(bool isTai
    return;
  }
 #ifdef __CCE_KT_TEST__
-    // CPU孪生调试无法使用多核同步，可能导致index为未初始化的脏数据，因此需要特殊处理
+    // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially
    if (this->firstExpertId > expertTokensCountOrCumsumGm.GetSize()) {
        return;
    }
@@ -202,7 +202,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCount(bool isTail
  int64_t copyLength = isTail ? this->lastExpertId - this->firstExpertId + 1 : this->expertNumUbAlign;
  DataCopyExtParams copyParams{static_cast<uint16_t>(1), static_cast<uint32_t>(copyLength * sizeof(int32_t)), 0, 0, 0};
 #ifdef __CCE_KT_TEST__
-    // CPU孪生调试不进行输出拷贝
+    // CPU twin debugging skips output copies
    return;
 #endif
  SetAtomicAdd<int32_t>();
@@ -307,4 +307,4 @@ __aicore__ inline void MoeV2ExpertTokenOut::Process() {
 }

 }  // namespace MoeInitRoutingQuantV2
-#endif  // INNER_MOE_V2_EXPERT_TOKEN_OUT_H
+#endif  // INNER_MOE_V2_EXPERT_TOKEN_OUT_H
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h
@@ -184,7 +184,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::CopyOutXQuant1H(int64_t progr

    inputXInQueue.EnQue<T>(inLocal);

-    // 计算quant
+    // Compute quantization
    Compute(smoothLocal);

    LocalTensor<float> quantScaleLocal = scaleOutQueue.DeQue<float>();
@@ -525,7 +525,7 @@ template <typename T>
 __aicore__ inline void MoeV2GatherDynamicQuant<T>::Process() {
  if (this->blockIdx < this->needCoreNum) {
    currentLoopRows = perLoopRows;
-    if (colLoops > 1) {  // 一行无法全载，需要workspace
+    if (colLoops > 1) {  // A single row cannot be fully loaded; workspace is required
      if (smoothType == 2) {
        for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
          CopyInExpandedExpertIdx(loop);
@@ -543,7 +543,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant<T>::Process() {
        CopyInExpandedRowIdx(this->rowLoops - 1);
        CopyOutPartialXQuant1H(this->rowLoops - 1);
      }
-    } else {  // 一行可以全载
+    } else {  // A single row can be fully loaded
      if (smoothType == 2) {
        for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) {
          CopyInExpandedExpertIdx(loop);
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h
@@ -111,7 +111,7 @@ __aicore__ inline void MoeV2GatherOut<T>::CopyOut(int64_t progress) {
        }
        outOffset = outIndex * cols + colsLoop * this->perLoopCols;
 #ifdef __CCE_KT_TEST__
-        // CPU孪生调试无法使用多核同步，可能导致index为未初始化的脏数据，因此需要特殊处理
+        // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially
        if (outOffset > expandedXGm.GetSize()) {
            continue;
        }
--- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h
@@ -132,7 +132,7 @@ __aicore__ inline void MoeV2SrcToDstWithCapacity<T, TilingData>::CopyOut(int64_t
            col = this->lastLoopCols;
          }
 #ifdef __CCE_KT_TEST__
-          // CPU孪生调试无法使用多核同步，可能导致index为未初始化的脏数据，因此需要特殊处理
+          // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially
          if (index * this->cols + i * this->perLoopCols + col * sizeof(T) > expandedXGm.GetSize()) {
              continue;
          }
@@ -266,4 +266,4 @@ __aicore__ inline void MoeV2SrcToDstWithCapacity<T, TilingData>::Process() {
  this->SyncAll();
 }
 }  // namespace MoeInitRoutingQuantV2
-#endif  // INNER_MOE_V2_SRC_TO_DST_WITH_CAPACITY_H
+#endif  // INNER_MOE_V2_SRC_TO_DST_WITH_CAPACITY_H
--- a/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h
+++ b/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h
@@ -107,7 +107,7 @@ KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::Init(GM_ADDR permuted_tokens, GM_ADD
    this->tokens_splited_num = tiling_data->tokens_splited_num;
    this->tokens_splited_remain = tiling_data->tokens_splited_remain;

-    // 处理token_by_core尾块
+    // Handle the tail block for token_by_core
    if (this->tokens_core_remain > 0 && blockIdx < this->tokens_core_remain) {
        this->tokens_core_length += 1;
        this->tokens_splited_remain += 1;
@@ -181,7 +181,7 @@ __aicore__ inline void KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::Process()
    for (int64_t i = 0; i < this->tokens_splited_num; ++i) {
        CalMultiOutToken(i * this->tokens_splited_length, this->tokens_splited_length);
    }
-    // 处理tokens_num不能均匀分核数的尾块
+    // Handle the tail block when tokens_num is not evenly divisible by core count
    if (this->tokens_splited_remain > 0) {
        CalMultiOutToken(this->tokens_splited_num * this->tokens_splited_length, this->tokens_splited_remain);
    }
@@ -231,7 +231,7 @@ __aicore__ inline void KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::CalSingleOutT
    for (int64_t h_index = 0; h_index < this->hidden_splited_num; ++h_index) {
        CalPartOutToken(start_token, h_index, this->hidden_splited_length, out_token_idx);
    }
-    // 一次不能完整容纳完整的hidden_size, 处理尾块
+    // Handle the tail block when a full hidden_size does not fit in one pass
    if (this->hidden_splited_remain > 0) {
        CalPartOutToken(start_token, this->hidden_splited_num, this->hidden_splited_remain, out_token_idx);
    }
@@ -248,7 +248,7 @@ KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::CalPartOutToken(const int64_t start_
    int64_t end_token = start_token + this->top_k;
    T2 cal_token_idx = this->indicesLocal.GetValue(start_token);

-    // 处理第一个Token数据
+    // Handle the first token
    if (cal_token_idx < this->num_out_tokens) {
        float probsValue = 0;
        if constexpr (PROBS) {
@@ -263,7 +263,7 @@ KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::CalPartOutToken(const int64_t start_
        Duplicate(this->token_tensor0, static_cast<float>(0), h_length);
    }

-    // 处理剩余的Token数据
+    // Handle the remaining tokens
    for (int64_t token_index = start_token + 1; token_index < end_token; ++token_index) {
        cal_token_idx = this->indicesLocal.GetValue(token_index);
        if (cal_token_idx < this->num_out_tokens) {
@@ -278,7 +278,7 @@ KernelMoeTokenUnpermute<T1, T2, T3, PROBS>::CalPartOutToken(const int64_t start_
        }
    }

-    // 输出计算结果
+    // Write out the computed result
    CopyOut(out_token_index, h_index, h_length);
 }

--- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp
@@ -146,27 +146,27 @@ public:
            auto gmTileD = gmD[loopIdx * blockN];
            LayoutC layoutUbC{1, blockN};

-            // 把C从GM workspace搬到UB
+            // Move C from GM workspace to UB
            AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
            copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
            AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);

-            //在UB上做把C cast成FP32
+            // Cast C to FP32 in UB
            AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
            AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
            AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);

-            // 获取pertoken scale值，gmPerTokenScale的第loopIdx行
+            // Get per-token scale from row loopIdx of gmPerTokenScale
            ElementPerTokenScale perTokenScale = gmPerTokenScale(loopIdx);

            AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
            AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
-            // pertoken scale值与FP32的C做Muls乘法
+            // Multiply FP32 C by the per-token scale
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
            AscendC::PipeBarrier<PIPE_V>();

-            // 将muls结果转回fp16/bf16
+            // Cast the muls result back to fp16/bf16
            LayoutD layoutUbD{1, blockN};
            AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[ubListId]);

--- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp
@@ -140,7 +140,7 @@ public:
    {
        params = params_;
    }
-    // 每个tile就是1*7168，每个block是一个expert的所有token=[group[i], 7168]
+    // Each tile is 1x7168, and each block covers all tokens for one expert = [group[i], 7168]
    CATLASS_DEVICE
    void operator() (
        AscendC::GlobalTensor<ElementC> const &gmC,
@@ -200,39 +200,39 @@ public:
            auto gmTileD = gmD[loopIdx * ChunkTileLen];
            LayoutC layoutUbC{1, blockN};

-            // 把C从GM workspace搬到UB
+            // Move C from GM workspace to UB
            AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
            copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
            AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);

-            // 在UB上做把C cast成FP32
+            // Cast C to FP32 in UB
            AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
            AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
            AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);

-            // 获取pertoken scale值，gmPerTokenScale的第loopIdx行
+            // Get per-token scale from row loopIdx of gmPerTokenScale
            ElementPerTokenScale perTokenScale = gmPerTokenScale1(loopIdx);

            AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
            AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
-            // pertoken scale值与FP32的C做Muls乘法
+            // Multiply FP32 C by the per-token scale
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
            AscendC::PipeBarrier<PIPE_V>();

-            //swiglue计算过程
+            // Swiglu computation process
            AscendC::Muls(ubCFp32ChunkN, ubCFp32, -1.0f, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Exp(ubCFp32ChunkN, ubCFp32ChunkN, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Adds(ubCFp32ChunkN, ubCFp32ChunkN, 1.0f, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
-            //TODO除的时候是否会对之后的数据有影响；
+            // TODO: confirm whether the division impacts subsequent data
            AscendC::Div(ubCFp32ChunkN, ubCFp32, ubCFp32ChunkN, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Mul(ubCFp32ChunkN, ubCFp32ChunkN, ubCFp32[ChunkTileLen], ChunkTileLen);
            
-            //quant过程，两种方式区别；
+            // Quantization process; difference between the two approaches
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Abs(ubAbs, ubCFp32ChunkN, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
@@ -243,7 +243,7 @@ public:
            AscendC::SetFlag<AscendC::HardEvent::V_S>(0);
            AscendC::WaitFlag<AscendC::HardEvent::V_S>(0);

-            //TODO两种计算方法的效率比较
+            // TODO: compare the efficiency of the two calculation methods
            ElementPerTokenScale GMubDequantScale = ubReduceMax.GetValue(0);
            AscendC::SetFlag<AscendC::HardEvent::S_V>(0);

--- a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp
@@ -56,7 +56,7 @@ FORCE_INLINE_AICORE int32_t gm_signal_wait_until_eq_for_barrier(__gm__ int32_t *
 constexpr int32_t MAX_RANK_SIZE = 32;
 class HcclShmem {
 public:
-    #ifdef HCCL_COMM    // hccl需要初始化hccl context
+    #ifdef HCCL_COMM    // HCCL needs to initialize the HCCL context
    __gm__ HcclOpResParamCustom *WinContext_{nullptr};
    Hccl<HCCL_SERVER_TYPE_AICPU> hccl_;
    GM_ADDR m_ptrArray[MAX_RANK_SIZE];
@@ -92,7 +92,7 @@ public:
    #endif

    FORCE_INLINE_AICORE
-    GM_ADDR operator() () const {   // 无参数，返回本地peermem
+    GM_ADDR operator() () const {   // No argument: return local peermem
        #ifdef HCCL_COMM
            return m_ptrArray[m_rank];
        #else
@@ -101,7 +101,7 @@ public:
    }

    FORCE_INLINE_AICORE
-    GM_ADDR operator() (int32_t index) const {  // 带index参数，返回远端peermem首地址
+    GM_ADDR operator() (int32_t index) const {  // With index: return remote peermem base address
        #ifdef HCCL_COMM
            return m_ptrArray[index];
        #else
@@ -126,22 +126,6 @@ public:
        #endif
    }

-        // FORCE_INLINE_AICORE
-    // GM_ADDR operator () (GM_ADDR ptr, int32_t index) const  {   // shmem_ptr相同用法
-    //     #ifdef HCCL_COMM
-    //         size_t offset = ptr - m_ptrArray[m_rank];
-    //         if (offset < 0 || offset >= m_segmentSize) {
-    //             return nullptr;
-    //         }
-    //         if (index < 0 || index >= m_rankSize) {
-    //             return nullptr;
-    //         }
-    //         return m_ptrArray[index] + offset;
-    //     #else
-    //         return shmem_ptr(ptr, index);
-    //     #endif
-    // }
-

    FORCE_INLINE_AICORE
    ~HcclShmem() {
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -157,6 +157,7 @@ cd ..
 # Install vLLM Ascend.
 git clone  --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git
 cd vllm-ascend
+git submodule update --init --recursive
 pip install -v -e .
 cd ..
 ```