diff --git a/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h b/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h index b0063d66..15361263 100644 --- a/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h +++ b/csrc/dispatch_ffn_combine/op_host/aclnn_dispatch_ffn_combine.h @@ -22,19 +22,22 @@ extern "C" { #endif /** - * 算子功能:实现分布式MoE从InitRouting到Unpermute全部算子的融合 - * @brief aclnnDispatchFFNCombine的第一段接口,根据具体的计算流程,计算workspace大小。 + * Operator function: fuse all distributed MoE ops from InitRouting through Unpermute. + * @brief First-stage interface of aclnnDispatchFFNCombine that calculates the workspace size based on the specific compute flow. * @domain aclnn_ops_infer - * @param [in] a: matmul左矩阵,数据类型支持:float16, bf16。 - * @param [in] b: matmul右矩阵,数据类型支持:float16, bf16。 - * @param [in] bias: 偏置,数据类型支持:float16, bf16。 - * @param [in] group: 标识通信域名称的字符串。 - * @param [in] worldsize: 通信域size,支持2/4/8卡。 - * @param [in] epRankId: ep本卡Id。取值范围[0, worldSize),各卡的rankId不能重复 - * @param [out] c: 计算+通信的结果,数据类型:同输入。 - * @param [out] workspaceSize: 返回需要在npu device侧申请的workspace大小。 - * @param [out] executor: 返回op执行器,包含了算子计算流程。 - * @return aclnnStatus: 返回状态码 + * @param [in] x: The input tensor. + * @param [in] weight1: The first weight tensor. + * @param [in] weight2: The second weight tensor. + * @param [in] expertId: The expert ID tensor. + * @param [in] scale1: The first scale tensor. + * @param [in] scale2: The second scale tensor. + * @param [in] probs: The probabilities tensor. + * @param [in] group: String identifying the communication domain name. + * @param [in] maxOutputSize: The maximum output size. + * @param [out] out: Result of computation + communication; same dtype as input. + * @param [out] workspaceSize: Workspace size to allocate on the NPU device side. + * @param [out] executor: Op executor containing the operator compute flow. + * @return aclnnStatus: Status code. */ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const aclTensor* weight1, const aclTensor* weight2, const aclTensor* expertId, const aclTensor* scale1, const aclTensor* scale2, @@ -44,12 +47,12 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWor uint64_t* workspaceSize, aclOpExecutor** executor); /** - * @brief aclnnDispatchGmmCombine的第二段接口,用于执行计算。 - * @param [in] workspace: 在npu device侧申请的workspace内存起址。 - * @param [in] workspace_size: 在npu device侧申请的workspace大小,由第一段接口aclnnDispatchFFNCombineGetWorkspaceSize获取。 - * @param [in] exector: op执行器,包含了算子计算流程。 - * @param [in] stream: acl stream流。 - * @return aclnnStatus: 返回状态码 + * @brief Second-stage interface of aclnnDispatchFFNCombine that executes the computation. + * @param [in] workspace: Start address of the workspace memory allocated on the NPU device side. + * @param [in] workspaceSize: Workspace size allocated on the NPU device side, obtained from the first-stage interface aclnnDispatchFFNCombineGetWorkspaceSize. + * @param [in] executor: Op executor containing the operator compute flow. + * @param [in] stream: ACL stream. + * @return aclnnStatus: Status code.
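Review note: the doc comments above describe the standard aclnn two-stage calling convention (query the workspace, then launch). Below is a minimal host-side caller sketch, assuming the aclTensor handles were already created with aclCreateTensor; the parameter types after `scale2` (probs, group, maxOutputSize, out) are inferred from the doc comment because the declaration is truncated in this hunk, and `RunDispatchFFNCombine` is a hypothetical helper, not part of this patch.

```cpp
#include "acl/acl.h"
#include "aclnn_dispatch_ffn_combine.h"

// Hypothetical wrapper: drives the two-stage interface end to end.
aclnnStatus RunDispatchFFNCombine(const aclTensor* x, const aclTensor* weight1,
                                  const aclTensor* weight2, const aclTensor* expertId,
                                  const aclTensor* scale1, const aclTensor* scale2,
                                  const aclTensor* probs, const char* group,
                                  int64_t maxOutputSize, aclTensor* out, aclrtStream stream) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    // Stage 1: compute the required workspace size and build the executor.
    aclnnStatus ret = aclnnDispatchFFNCombineGetWorkspaceSize(
        x, weight1, weight2, expertId, scale1, scale2, probs, group,
        maxOutputSize, out, &workspaceSize, &executor);
    if (ret != ACL_SUCCESS) {
        return ret;
    }
    // Allocate device-side workspace only if the op asked for one.
    void* workspace = nullptr;
    if (workspaceSize > 0) {
        aclError mret = aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        if (mret != ACL_SUCCESS) {
            return static_cast<aclnnStatus>(mret);
        }
    }
    // Stage 2: launch on the stream, then wait so the workspace can be freed.
    ret = aclnnDispatchFFNCombine(workspace, workspaceSize, executor, stream);
    if (ret == ACL_SUCCESS) {
        ret = static_cast<aclnnStatus>(aclrtSynchronizeStream(stream));
    }
    if (workspace != nullptr) {
        aclrtFree(workspace);
    }
    return ret;
}
```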
*/ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombine(void* workspace, uint64_t workspaceSize, aclOpExecutor* executor, aclrtStream stream); @@ -58,4 +61,4 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombine(void* } #endif -#endif // OP_API_INC_GMM_ALLTOALLV_ \ No newline at end of file +#endif // OP_API_INC_DISPATCH_FFN_COMBINE_ diff --git a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp index d487c453..649edf1f 100644 --- a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp +++ b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_def.cpp @@ -56,7 +56,7 @@ class DispatchFFNCombine : public OpDef { .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); - // 输出 + // Output this->Output("out") .ParamType(REQUIRED) .DataType({ge::DT_FLOAT16, ge::DT_BF16, ge::DT_BF16}) diff --git a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp index a7f5f7ed..6342f1a1 100644 --- a/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp +++ b/csrc/dispatch_ffn_combine/op_host/dispatch_ffn_combine_tiling.cpp @@ -27,7 +27,7 @@ using namespace AscendC; using namespace ge; namespace { - // 1. 常量定义 + // 1. Constant definitions const char *K_INNER_DEBUG = "DispatchFFNCombine Tiling Debug"; constexpr uint32_t ATTR_GROUP_INDEX = 0; constexpr uint32_t ATTR_MAX_OUTPUT_SIZE_INDEX = 1; @@ -54,13 +54,13 @@ static int32_t CeilDev(int32_t num, int32_t div) return (num + div - 1) / div; } -// 解析并校验 rankId, group, worldSize, isTransB 属性值 +// Parse and validate rankId, group, worldSize, and isTransB attributes static ge::graphStatus DispatchFFNCombineCheckAttrAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo& info) { auto attrs = context->GetAttrs(); OP_TILING_CHECK(attrs == nullptr, OP_LOGE(K_INNER_DEBUG, "attrs is null."), return ge::GRAPH_FAILED); - // todo:Attr相关tilingdata的设置、校验、打印 + // TODO: set, validate, and print tiling data related to attributes auto groupPtr = attrs->GetAttrPointer(static_cast(ATTR_GROUP_INDEX)); auto maxOutputSizePtr = attrs->GetAttrPointer(ATTR_MAX_OUTPUT_SIZE_INDEX); auto is_trans_b = attrs->GetAttrPointer(ATTR_IS_TRANS_B); @@ -87,7 +87,7 @@ static ge::graphStatus DispatchFFNCombineCheckAttrAndSetTiling(gert::TilingConte return ge::GRAPH_SUCCESS; } -// 提取输入张量 A 和 B 的形状,计算出 M、K、N 值 +// Extract shapes of input tensors A and B to compute M, K, N static ge::graphStatus DispatchFFNCombineCheckShapeAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo &info) { const char *nodeName = context->GetNodeName(); @@ -116,7 +116,7 @@ static ge::graphStatus DispatchFFNCombineCheckShapeAndSetTiling(gert::TilingCont return ge::GRAPH_SUCCESS; } -// 获取当前芯片平台的 AI Core 数目、UB 容量等硬件信息。 +// Get hardware info such as AI Core count and UB capacity for the current chip platform. 
static ge::graphStatus DispatchFFNCombineGetPlatformInfoAndSetTiling(gert::TilingContext *context, DispatchFFNCombineInfo& info) { auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); @@ -146,9 +146,9 @@ void SetTilingData(CoCTiling &cocTilingData, DispatchFFNCombineInfo &info) cocTilingData.lenPerLoop = cocTilingData.m0 * cocTilingData.n0 / 2; } -// 主调度函数: -// 获取 tilingData ➝ 检查 Attr ➝ 检查 Shape ➝ 获取平台信息 -// ➝ 调用 SetTilingData(根据rank数目) ➝ 设置 blockDim ➝ 设置 tilingKey ➝ 设置 workspace ➝ 配置通信参数 +// Main scheduling function: +// Get tilingData ➝ check Attr ➝ check Shape ➝ get platform info +// ➝ call SetTilingData (based on rank count) ➝ set blockDim ➝ set tilingKey ➝ set workspace ➝ configure communication parameters static ge::graphStatus DispatchFFNCombineTilingFuncImpl(gert::TilingContext *context) { @@ -262,4 +262,4 @@ ge::graphStatus TilingParseForDispatchFFNCombine(gert::TilingParseContext *conte IMPL_OP_OPTILING(DispatchFFNCombine) .Tiling(DispatchFFNCombineTilingFunc) .TilingParse(TilingParseForDispatchFFNCombine); -} // namespace optiling \ No newline at end of file +} // namespace optiling diff --git a/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h b/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h index 7bc4b835..827d4c5b 100644 --- a/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h +++ b/csrc/dispatch_ffn_combine/op_host/hcom_topo_info.h @@ -64,8 +64,8 @@ class HcomTopoInfo { ~HcomTopoInfo() = default; std::unordered_map rank_info_; std::mutex mutex_; - std::unordered_map group_to_ordered_stream_; // 通信域保序流 - std::unordered_map> device_id_to_group_to_ordered_stream_; // 通信域保序流 + std::unordered_map group_to_ordered_stream_; // Ordered stream for the communication domain + std::unordered_map> device_id_to_group_to_ordered_stream_; // Ordered stream for the communication domain }; } diff --git a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp index 311e2608..179cdc8e 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/dispatch_ffn_combine_kernel.hpp @@ -8,8 +8,8 @@ * See LICENSE in the root of the software repository for the full text of the License. 
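Review note: the pipeline comment above is the key to reading this tiling file. A hedged sketch of how the staged checks chain together follows; the helper names are the real ones from this diff, but `TilingFuncSketch` and its body are illustrative, not the actual implementation.

```cpp
// Illustrative only: the staged checks chain with early exits, as the
// pipeline comment describes (within this file's existing includes).
static ge::graphStatus TilingFuncSketch(gert::TilingContext* context) {
    DispatchFFNCombineInfo info{};
    if (DispatchFFNCombineCheckAttrAndSetTiling(context, info) != ge::GRAPH_SUCCESS ||
        DispatchFFNCombineCheckShapeAndSetTiling(context, info) != ge::GRAPH_SUCCESS ||
        DispatchFFNCombineGetPlatformInfoAndSetTiling(context, info) != ge::GRAPH_SUCCESS) {
        return ge::GRAPH_FAILED;
    }
    // The real DispatchFFNCombineTilingFuncImpl then calls SetTilingData
    // (per rank count) and sets blockDim, tilingKey, workspace, and the
    // communication parameters before returning.
    return ge::GRAPH_SUCCESS;
}
```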
*/ -#ifndef DISPATH_FFN_COMBINE_KERNEL_HPP -#define DISPATH_FFN_COMBINE_KERNEL_HPP +#ifndef DISPATCH_FFN_COMBINE_KERNEL_HPP +#define DISPATCH_FFN_COMBINE_KERNEL_HPP #include "kernel_operator.h" @@ -324,7 +324,7 @@ private: int64_t gmGroupOffsetC = 0; uint32_t startCoreIdx = 0; uint32_t syncGroupIdx = 0; - AscendC::CrossCoreWaitFlag<0x2>(0); // 等待aiv计算cumsumformm + AscendC::CrossCoreWaitFlag<0x2>(0); // Wait for AIV to finish cumsum for matmul int64_t preCurrentmSum = 0; int32_t syncLoopIdx = -1; for (uint32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) { @@ -364,7 +364,7 @@ private: int64_t gmOffsetA = layoutA.GetOffset(offsetA); int64_t gmOffsetB = layoutB1.GetOffset(offsetB); int64_t gmOffsetC = layoutC.GetOffset(offsetC); - int64_t gmOffsetS = groupIdx * params.problemShape.n() + blockCoord.n() * L1TileShape::N; // 每个expert一组scale + int64_t gmOffsetS = groupIdx * params.problemShape.n() + blockCoord.n() * L1TileShape::N; // One scale group per expert if (currentM > 0) { blockMmad( gmA[gmGroupOffsetA + gmOffsetA], layoutA, @@ -465,7 +465,7 @@ private: int64_t gmOffsetA = layoutA.GetOffset(offsetA); int64_t gmOffsetB = layoutB2.GetOffset(offsetB); int64_t gmOffsetC = layoutC.GetOffset(offsetC); - int64_t gmOffsetS = groupIdx * n2 + blockCoord.n() * L1TileShape::N; // 每个expert一组scale + int64_t gmOffsetS = groupIdx * n2 + blockCoord.n() * L1TileShape::N; // One scale group per expert if (currentM > 0) { blockMmad( gmPermutedToken[gmGroupOffsetA + gmOffsetA], layoutA, @@ -537,7 +537,7 @@ private: void Dispatch(Params const ¶ms) { icache_preload(8); int64_t localTokenPerExpertOffset = peermemInfo.offsetPeerTokenPerExpert + tokenPerExpertLayout(params.rank, 0, 0) * sizeof(int32_t); - GM_ADDR localTokenPerExpert = shmem() + localTokenPerExpertOffset; // 把通信矩阵全部放到peermem + GM_ADDR localTokenPerExpert = shmem() + localTokenPerExpertOffset; // Place the entire communication matrix in peermem uint32_t expandedRowIdxOffset = AlignUp(params.problemShape.m(), 256) * params.topK * sizeof(int32_t); //---initRouting------ @@ -571,7 +571,7 @@ private: int32_t syncLoopIdx = -1; BlockEpilogue1 blockEpilogue(resource); for (int32_t groupIdx = 0; groupIdx < params.expertPerRank; ++groupIdx) { - // 第i个core从第i个rank的peermem读数据 + // The ith core reads data from the ith rank's peermem groupIdxDeq = groupIdx - 2; for(int32_t dstEpIdx = coreIdx; dstEpIdx < params.EP; dstEpIdx += coreNum) { uint32_t rowStart = (dstEpIdx == 0 ? 
0 : cumsumMM((dstEpIdx - 1) * params.expertPerRank + groupIdx)) + prevGroupSum1; @@ -592,9 +592,9 @@ private: MatrixCoord offsetPeer{rowSrc, 0}; int64_t gmOffsetA = params.layoutA.GetOffset(offsetA); int64_t gmOffsetPeer = params.layoutA.GetOffset(offsetPeer); - // 通信Data + // Communication data CopyGMToGM(gmA[gmOffsetA], gmRemoteA[gmOffsetPeer], rows * params.problemShape.k(), params.ubMoveNum); - // 通信scale + // Communication scale CopyGMToGM(gmPerTokenScale1[rowStart], gmRemotePerTokenScale[rowSrc], rows, rows); } } @@ -604,7 +604,7 @@ private: AscendC::CrossCoreWaitFlag<0x2>(syncLoopIdx / 8 + 1); } AscendC::SyncAll(); - AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0); // V通知C当前轮的通信已完成 + AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0); // V notifies C that the current communication round is complete if ((params.epilogueGranularity < params.expertPerRank && params.epilogueGranularity > 0) && groupIdx == params.expertPerRank - 1 && prevGroupSum1 > 0) { uint32_t rowStartThisCore = 0; @@ -664,7 +664,7 @@ private: uint32_t n2 = params.problemShape.k(); uint32_t k2 = params.problemShape.n() / 2; - // TODO 计算tokenperexpert的cumsum + // TODO compute the cumsum of tokenPerExpert typename BlockEpilogue2::Params epilogueParams{ static_cast(params.EP), static_cast(params.expertPerRank), @@ -774,10 +774,10 @@ private: CATLASS_DEVICE PeermemInfo(const Params & params, const HcclShmem & shmem) { - offsetA = 0; // 占用1/3的BUFFSIZE - offsetPeerPerTokenScale = offsetA + AlignUp(shmem.SegmentSize() / 3, 512); // 占用1MB - offsetD = offsetPeerPerTokenScale + MB_SIZE; // 占用剩下的 - offsetPeerTokenPerExpert = shmem.SegmentSize() - 2 * MB_SIZE; // 占用最后2MB + offsetA = 0; // Occupies one third of BUFFSIZE + offsetPeerPerTokenScale = offsetA + AlignUp(shmem.SegmentSize() / 3, 512); // Occupies 1 MB + offsetD = offsetPeerPerTokenScale + MB_SIZE; // Occupies the remaining space + offsetPeerTokenPerExpert = shmem.SegmentSize() - 2 * MB_SIZE; // Occupies the final 2 MB } }; @@ -811,4 +811,4 @@ private: } // namespace Catlass::Gemm::Kernel -#endif // DISPATH_FFN_COMBINE_KERNEL_HPP \ No newline at end of file +#endif // DISPATCH_FFN_COMBINE_KERNEL_HPP diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp index 8453c810..811b2ce9 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_quant_v2.cpp @@ -80,7 +80,7 @@ __aicore__ inline void moe_init_routing_quant_v2( sortPipe.Destroy(); } - if (tilingKey == 10000 || tilingKey == 10010 || tilingKey ==11000 || tilingKey ==11010) { //没有drop的情况 + if (tilingKey == 10000 || tilingKey == 10010 || tilingKey ==11000 || tilingKey ==11010) { // No drop scenario if (tilingData->expertTokensCountOrCumsumFlag != EXERPT_TOKENS_NONE) { TPipe expertTokenOutPipe; MoeV2ExpertTokenOut expertTokenOutOp; expertTokenOutOp.Init(expertTokensCountOrCumsum, expandedRowIdx, workspace, tilingData, &expertTokenOutPipe); expertTokenOutOp.Process(); expertTokenOutPipe.Destroy(); } TPipe srcToDstPipe; MoeV2SrcToDstOp srcToDstOp; srcToDstOp.Init(expandedRowIdx, workspace, tilingData, &srcToDstPipe); srcToDstOp.Process(); srcToDstPipe.Destroy(); - } else if (tilingKey ==10100 || tilingKey ==10110 || tilingKey ==11100 || tilingKey ==11110) { //有drop的情况 + } else if (tilingKey ==10100 || tilingKey ==10110 || tilingKey ==11100 || tilingKey ==11110) { // Drop scenario TPipe expertTokenOutPipe; MoeV2ExpertTokenOut expertTokenOutOp; expertTokenOutOp.Init(expertTokensCountOrCumsum,
expertTokensBeforeCapacity, diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h index 37130b17..02dee8f8 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_init_routing_v2_tiling.h @@ -178,7 +178,7 @@ uint64_t InnerMoeInitRoutingV2TilingBase::GetTilingKey() const { return TILING_KEY_HIGH_PERFORMANCE; } if (dropPadMode == 0) { - if (totalLength <= sortLoopMaxElement) { // 排序只用到一个核排序 + if (totalLength <= sortLoopMaxElement) { // Sorting uses only one core return TILING_KEY_DROPLESS_SORT_ONE_CORE; } else { return TILING_KEY_DROPLESS_SORT_MULTI_CORE; @@ -206,10 +206,10 @@ bool InnerMoeInitRoutingV2TilingBase::GetShapeAttrsInfo(int64_t m, int64_t cols, this->expertTokensCountOrCumsumFlag = expertTokensCountOrCumsumFlag; this->expertTokensBeforeCapacityFlag = expertTokensBeforeCapacityFlag; if (dropPadMode == 1) { - // droppad场景下不输出expertTokensCountOrCumsum + // Do not output expertTokensCountOrCumsum in drop-pad mode expertTokensCountOrCumsumFlag = 0; } else { - // dropless场景下不输出expertTokensBeforeCapacity + // Do not output expertTokensBeforeCapacity in dropless mode expertTokensBeforeCapacityFlag = false; } moeInitRoutingTilingData.cols = cols; @@ -235,8 +235,8 @@ bool InnerMoeInitRoutingV2TilingBase::GetPlatformInfo(int64_t aivCoreNum, int64_ bool InnerMoeInitRoutingV2TilingBase::GetWorkspaceSize() { - // 计算workspace大小 - size_t sortWorkspaceSize = totalLength * sizeof(float) * NUM_TWO * NUM_THREE; // 排序需要的空间 + // Calculate workspace size + size_t sortWorkspaceSize = totalLength * sizeof(float) * NUM_TWO * NUM_THREE; // Space needed for sorting size_t scatterWorkspaceSize = totalLength * sizeof(int32_t) * NUM_TWO; size_t expertTokenFlagSize = aivNum * 2 * sizeof(int32_t); workspaceSize_ = sortWorkspaceSize + scatterWorkspaceSize + expertTokenFlagSize + SIZE_16 * LENGTH_1024 * LENGTH_1024; @@ -257,11 +257,11 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSOneCoreCompute(InnerMoeV2VBSComp void InnerMoeInitRoutingV2TilingBase::Tiling4VBSMultiCoreCompute(InnerMoeV2VBSComputeTilingData* tilingData) { //Tiling4VBSMultiCoreCompute - int64_t needCoreNum = CeilDiv(totalLength, sortLoopMaxElement); // 向上取整 + int64_t needCoreNum = CeilDiv(totalLength, sortLoopMaxElement); // Round up needCoreNum = static_cast(std::pow(4, CeilLog4(needCoreNum))); - needCoreNum = std::min(needCoreNum, aivNum); // 不能超过物理核数 + needCoreNum = std::min(needCoreNum, aivNum); // Cannot exceed physical core count if (needCoreNum > 0) { - int64_t perCoreElements = totalLength / needCoreNum; // 每个核处理的元素数 + int64_t perCoreElements = totalLength / needCoreNum; // Elements handled per core int64_t alineFloorPerCoreElements = perCoreElements - perCoreElements % SORT32_ALIGN_ELEMENT; int64_t lastCoreElement = totalLength - (needCoreNum - 1) * alineFloorPerCoreElements; int64_t alineCeilPerCoreElements = perCoreElements + SORT32_ALIGN_ELEMENT - perCoreElements % SORT32_ALIGN_ELEMENT; @@ -274,7 +274,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSMultiCoreCompute(InnerMoeV2VBSCo tilingData->needCoreNum = needCoreNum; do { tilingData->perCoreElements = perCoreElements; - tilingData->perCoreLoops = CeilDiv(tilingData->perCoreElements, sortLoopMaxElement); // 每个核处理的loop数 + tilingData->perCoreLoops = CeilDiv(tilingData->perCoreElements, sortLoopMaxElement); // Loops 
handled per core tilingData->perCorePerLoopElements = std::min(tilingData->perCoreElements, sortLoopMaxElement); tilingData->perCoreLastLoopElements = tilingData->perCoreElements - (tilingData->perCoreLoops - 1) * tilingData->perCorePerLoopElements; tilingData->lastCoreElements = totalLength - (tilingData->needCoreNum - 1) * tilingData->perCoreElements; @@ -294,7 +294,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VBSCompute() { auto tilingData = &moeInitRoutingTilingData.vbsComputeParamsOp; tilingData->oneLoopMaxElements = sortLoopMaxElement; - if (totalLength <= sortLoopMaxElement) { // 只用到一个核 + if (totalLength <= sortLoopMaxElement) { // Only one core is used Tiling4VBSOneCoreCompute(tilingData); return; } @@ -304,11 +304,11 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4VMSMiddleCompute() { auto vbsComputeTilingData = &moeInitRoutingTilingData.vbsComputeParamsOp; auto tilingData = &moeInitRoutingTilingData.vmsMiddleComputeParamsOp; - if (vbsComputeTilingData->needCoreNum <= MRG_LIST_NUM) { // 队列数小于一次vms则没有中间归并 - tilingData->needCoreNum = 0; // 需要的核数 + if (vbsComputeTilingData->needCoreNum <= MRG_LIST_NUM) { // No intermediate merge if queue count fits one VMS + tilingData->needCoreNum = 0; // Required core count } else { int64_t needCoreNum = CeilDiv(vbsComputeTilingData->needCoreNum, MRG_LIST_NUM); - tilingData->needCoreNum = needCoreNum; // 需要的核数 + tilingData->needCoreNum = needCoreNum; // Required core count } } @@ -333,7 +333,7 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4SrcToDstCompute() { tilingData->needCoreNum = needCoreNum; int64_t lastCoreNum = totalLength - perCoreRows * (tilingData->needCoreNum - 1); tilingData->perCoreRows = perCoreRows; - if (perLoopMaxRows >= tilingData->perCoreRows) { // 一个loop结束 + if (perLoopMaxRows >= tilingData->perCoreRows) { // Fits in a single loop tilingData->perCorePerLoopRows = tilingData->perCoreRows; tilingData->perCoreLastLoopRows = tilingData->perCoreRows; } else { @@ -407,4 +407,4 @@ void InnerMoeInitRoutingV2TilingBase::Tiling4SrcToDstCapacityCompute() { } } -} \ No newline at end of file +} diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h index fe057656..ac4c1f95 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_expert_token_out.h @@ -151,7 +151,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCumsum(bool isTai return; } #ifdef __CCE_KT_TEST__ - // CPU孪生调试无法使用多核同步,可能导致index为未初始化的脏数据,因此需要特殊处理 + // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially if (this->firstExpertId > expertTokensCountOrCumsumGm.GetSize()) { return; } @@ -202,7 +202,7 @@ __aicore__ inline void MoeV2ExpertTokenOut::CopyOutExpertTokensCount(bool isTail int64_t copyLength = isTail ?
this->lastExpertId - this->firstExpertId + 1 : this->expertNumUbAlign; DataCopyExtParams copyParams{static_cast(1), static_cast(copyLength * sizeof(int32_t)), 0, 0, 0}; #ifdef __CCE_KT_TEST__ - // CPU孪生调试不进行输出拷贝 + // CPU twin debugging skips output copies return; #endif SetAtomicAdd(); @@ -307,4 +307,4 @@ __aicore__ inline void MoeV2ExpertTokenOut::Process() { } } // namespace MoeInitRoutingQuantV2 -#endif // INNER_MOE_V2_EXPERT_TOKEN_OUT_H \ No newline at end of file +#endif // INNER_MOE_V2_EXPERT_TOKEN_OUT_H diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h index 9f3ef220..924e8548 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_dynamic_quant.h @@ -184,7 +184,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant::CopyOutXQuant1H(int64_t progr inputXInQueue.EnQue(inLocal); - // 计算quant + // Compute quantization Compute(smoothLocal); LocalTensor quantScaleLocal = scaleOutQueue.DeQue(); @@ -525,7 +525,7 @@ template __aicore__ inline void MoeV2GatherDynamicQuant::Process() { if (this->blockIdx < this->needCoreNum) { currentLoopRows = perLoopRows; - if (colLoops > 1) { // 一行无法全载,需要workspace + if (colLoops > 1) { // A single row cannot be fully loaded; workspace is required if (smoothType == 2) { for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) { CopyInExpandedExpertIdx(loop); @@ -543,7 +543,7 @@ __aicore__ inline void MoeV2GatherDynamicQuant::Process() { CopyInExpandedRowIdx(this->rowLoops - 1); CopyOutPartialXQuant1H(this->rowLoops - 1); } - } else { // 一行可以全载 + } else { // A single row can be fully loaded if (smoothType == 2) { for (int64_t loop = 0; loop < this->rowLoops - 1; loop++) { CopyInExpandedExpertIdx(loop); diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h index bbdb1338..76427276 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_gather_out.h @@ -111,7 +111,7 @@ __aicore__ inline void MoeV2GatherOut::CopyOut(int64_t progress) { } outOffset = outIndex * cols + colsLoop * this->perLoopCols; #ifdef __CCE_KT_TEST__ - // CPU孪生调试无法使用多核同步,可能导致index为未初始化的脏数据,因此需要特殊处理 + // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially if (outOffset > expandedXGm.GetSize()) { continue; } diff --git a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h index 850e66b0..770163fc 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h +++ b/csrc/dispatch_ffn_combine/op_kernel/moe_init_routing_quant_v2/moe_v2_src_to_dst_with_capacity.h @@ -132,7 +132,7 @@ __aicore__ inline void MoeV2SrcToDstWithCapacity::CopyOut(int64_t col = this->lastLoopCols; } #ifdef __CCE_KT_TEST__ - // CPU孪生调试无法使用多核同步,可能导致index为未初始化的脏数据,因此需要特殊处理 + // CPU twin debugging cannot use multi-core sync, so index may contain uninitialized dirty data; handle specially if (index * this->cols + i * this->perLoopCols + col * sizeof(T) > expandedXGm.GetSize()) { 
continue; } @@ -266,4 +266,4 @@ __aicore__ inline void MoeV2SrcToDstWithCapacity::Process() { this->SyncAll(); } } // namespace MoeInitRoutingQuantV2 -#endif // INNER_MOE_V2_SRC_TO_DST_WITH_CAPACITY_H \ No newline at end of file +#endif // INNER_MOE_V2_SRC_TO_DST_WITH_CAPACITY_H diff --git a/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h b/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h index 12b35a29..1255b5cf 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h +++ b/csrc/dispatch_ffn_combine/op_kernel/unpermute/moe_token_unpermute.h @@ -107,7 +107,7 @@ KernelMoeTokenUnpermute::Init(GM_ADDR permuted_tokens, GM_ADD this->tokens_splited_num = tiling_data->tokens_splited_num; this->tokens_splited_remain = tiling_data->tokens_splited_remain; - // 处理token_by_core尾块 + // Handle the tail block for token_by_core if (this->tokens_core_remain > 0 && blockIdx < this->tokens_core_remain) { this->tokens_core_length += 1; this->tokens_splited_remain += 1; @@ -181,7 +181,7 @@ __aicore__ inline void KernelMoeTokenUnpermute::Process() for (int64_t i = 0; i < this->tokens_splited_num; ++i) { CalMultiOutToken(i * this->tokens_splited_length, this->tokens_splited_length); } - // 处理tokens_num不能均匀分核数的尾块 + // Handle the tail block when tokens_num is not evenly divisible by core count if (this->tokens_splited_remain > 0) { CalMultiOutToken(this->tokens_splited_num * this->tokens_splited_length, this->tokens_splited_remain); } @@ -231,7 +231,7 @@ __aicore__ inline void KernelMoeTokenUnpermute::CalSingleOutT for (int64_t h_index = 0; h_index < this->hidden_splited_num; ++h_index) { CalPartOutToken(start_token, h_index, this->hidden_splited_length, out_token_idx); } - // 一次不能完整容纳完整的hidden_size, 处理尾块 + // Handle the tail block when a full hidden_size does not fit in one pass if (this->hidden_splited_remain > 0) { CalPartOutToken(start_token, this->hidden_splited_num, this->hidden_splited_remain, out_token_idx); } @@ -248,7 +248,7 @@ KernelMoeTokenUnpermute::CalPartOutToken(const int64_t start_ int64_t end_token = start_token + this->top_k; T2 cal_token_idx = this->indicesLocal.GetValue(start_token); - // 处理第一个Token数据 + // Handle the first token if (cal_token_idx < this->num_out_tokens) { float probsValue = 0; if constexpr (PROBS) { @@ -263,7 +263,7 @@ KernelMoeTokenUnpermute::CalPartOutToken(const int64_t start_ Duplicate(this->token_tensor0, static_cast(0), h_length); } - // 处理剩余的Token数据 + // Handle the remaining tokens for (int64_t token_index = start_token + 1; token_index < end_token; ++token_index) { cal_token_idx = this->indicesLocal.GetValue(token_index); if (cal_token_idx < this->num_out_tokens) { @@ -278,7 +278,7 @@ KernelMoeTokenUnpermute::CalPartOutToken(const int64_t start_ } } - // 输出计算结果 + // Write out the computed result CopyOut(out_token_index, h_index, h_length); } diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp index 8bdd017d..4b627a67 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp @@ -146,27 +146,27 @@ public: auto gmTileD = gmD[loopIdx * blockN]; LayoutC layoutUbC{1, blockN}; - // 把C从GM workspace搬到UB + // Move C from GM workspace to UB AscendC::WaitFlag(eventUbCVMTE2List[ubListId]); copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC); AscendC::SetFlag(eventUbCMTE2VList[ubListId]); - //在UB上做把C 
cast成FP32 + // Cast C to FP32 in UB AscendC::WaitFlag(eventUbCMTE2VList[ubListId]); AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN); AscendC::SetFlag(eventUbCVMTE2List[ubListId]); - // 获取pertoken scale值,gmPerTokenScale的第loopIdx行 + // Get per-token scale from row loopIdx of gmPerTokenScale ElementPerTokenScale perTokenScale = gmPerTokenScale(loopIdx); AscendC::SetFlag(0); AscendC::WaitFlag(0); - // pertoken scale值与FP32的C做Muls乘法 + // Multiply FP32 C by the per-token scale AscendC::PipeBarrier(); AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN); AscendC::PipeBarrier(); - // 将muls结果转回fp16/bf16 + // Cast the muls result back to fp16/bf16 LayoutD layoutUbD{1, blockN}; AscendC::WaitFlag(eventUbDMTE3VList[ubListId]); diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp index adca19f6..26630122 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp +++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp @@ -140,7 +140,7 @@ public: { params = params_; } - // 每个tile就是1*7168,每个block是一个expert的所有token=[group[i], 7168] + // Each tile is 1x7168, and each block covers all tokens for one expert = [group[i], 7168] CATLASS_DEVICE void operator() ( AscendC::GlobalTensor const &gmC, @@ -200,39 +200,39 @@ public: auto gmTileD = gmD[loopIdx * ChunkTileLen]; LayoutC layoutUbC{1, blockN}; - // 把C从GM workspace搬到UB + // Move C from GM workspace to UB AscendC::WaitFlag(eventUbCVMTE2List[ubListId]); copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC); AscendC::SetFlag(eventUbCMTE2VList[ubListId]); - // 在UB上做把C cast成FP32 + // Cast C to FP32 in UB AscendC::WaitFlag(eventUbCMTE2VList[ubListId]); AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN); AscendC::SetFlag(eventUbCVMTE2List[ubListId]); - // 获取pertoken scale值,gmPerTokenScale的第loopIdx行 + // Get per-token scale from row loopIdx of gmPerTokenScale ElementPerTokenScale perTokenScale = gmPerTokenScale1(loopIdx); AscendC::SetFlag(0); AscendC::WaitFlag(0); - // pertoken scale值与FP32的C做Muls乘法 + // Multiply FP32 C by the per-token scale AscendC::PipeBarrier(); AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN); AscendC::PipeBarrier(); - //swiglue计算过程 + // Swiglu computation process AscendC::Muls(ubCFp32ChunkN, ubCFp32, -1.0f, ChunkTileLen); AscendC::PipeBarrier(); AscendC::Exp(ubCFp32ChunkN, ubCFp32ChunkN, ChunkTileLen); AscendC::PipeBarrier(); AscendC::Adds(ubCFp32ChunkN, ubCFp32ChunkN, 1.0f, ChunkTileLen); AscendC::PipeBarrier(); - //TODO除的时候是否会对之后的数据有影响; + // TODO: confirm whether the division impacts subsequent data AscendC::Div(ubCFp32ChunkN, ubCFp32, ubCFp32ChunkN, ChunkTileLen); AscendC::PipeBarrier(); AscendC::Mul(ubCFp32ChunkN, ubCFp32ChunkN, ubCFp32[ChunkTileLen], ChunkTileLen); - //quant过程,两种方式区别; + // Quantization process; difference between the two approaches AscendC::PipeBarrier(); AscendC::Abs(ubAbs, ubCFp32ChunkN, ChunkTileLen); AscendC::PipeBarrier(); @@ -243,7 +243,7 @@ public: AscendC::SetFlag(0); AscendC::WaitFlag(0); - //TODO两种计算方法的效率比较 + // TODO: compare the efficiency of the two calculation methods ElementPerTokenScale GMubDequantScale = ubReduceMax.GetValue(0); AscendC::SetFlag(0); diff --git a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp index b66268fb..fd2b995c 100644 --- a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp +++ 
b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp @@ -56,7 +56,7 @@ FORCE_INLINE_AICORE int32_t gm_signal_wait_until_eq_for_barrier(__gm__ int32_t * constexpr int32_t MAX_RANK_SIZE = 32; class HcclShmem { public: - #ifdef HCCL_COMM // hccl需要初始化hccl context + #ifdef HCCL_COMM // HCCL needs to initialize the HCCL context __gm__ HcclOpResParamCustom *WinContext_{nullptr}; Hccl hccl_; GM_ADDR m_ptrArray[MAX_RANK_SIZE]; @@ -92,7 +92,7 @@ public: #endif FORCE_INLINE_AICORE - GM_ADDR operator() () const { // 无参数,返回本地peermem + GM_ADDR operator() () const { // No argument: return local peermem #ifdef HCCL_COMM return m_ptrArray[m_rank]; #else @@ -101,7 +101,7 @@ public: } FORCE_INLINE_AICORE - GM_ADDR operator() (int32_t index) const { // 带index参数,返回远端peermem首地址 + GM_ADDR operator() (int32_t index) const { // With index: return remote peermem base address #ifdef HCCL_COMM return m_ptrArray[index]; #else @@ -126,22 +126,6 @@ public: #endif } - // FORCE_INLINE_AICORE - // GM_ADDR operator () (GM_ADDR ptr, int32_t index) const { // shmem_ptr相同用法 - // #ifdef HCCL_COMM - // size_t offset = ptr - m_ptrArray[m_rank]; - // if (offset < 0 || offset >= m_segmentSize) { - // return nullptr; - // } - // if (index < 0 || index >= m_rankSize) { - // return nullptr; - // } - // return m_ptrArray[index] + offset; - // #else - // return shmem_ptr(ptr, index); - // #endif - // } - FORCE_INLINE_AICORE ~HcclShmem() { diff --git a/docs/source/installation.md b/docs/source/installation.md index 3f803413..3aa42944 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -157,6 +157,7 @@ cd .. # Install vLLM Ascend. git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git cd vllm-ascend +git submodule update --init --recursive pip install -v -e . cd .. ```
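Review note on the cross-core flags in dispatch_ffn_combine_kernel.hpp: the translated comments now make the protocol visible. AIV (vector) cores publish cumsum and per-round communication results, then raise a flag; AIC (cube) cores wait on that flag before each grouped-matmul round. A condensed sketch of the handshake, reusing the exact intrinsics from the diff (flag mode 0x2, flag ID 0); this is a reading aid, not code from the patch.

```cpp
#include "kernel_operator.h"

// AIV side (Dispatch): after this round's tokens and scales land in peermem,
// all vector cores sync, then raise flag 0 for the cube cores.
__aicore__ inline void AivPublishRound() {
    AscendC::SyncAll();
    AscendC::CrossCoreSetFlag<0x2, PIPE_MTE3>(0);  // "communication round complete"
}

// AIC side (grouped matmul): block until the vector cores have published.
__aicore__ inline void AicAwaitRound() {
    AscendC::CrossCoreWaitFlag<0x2>(0);  // pairs with the Set above
    // ...now safe to read the cumsum / permuted tokens from GM...
}
```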
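Review note on PeermemInfo: the constructor partitions one shmem segment as [A | per-token scale (1 MB) | D | token-per-expert (2 MB tail)], but nothing asserts the regions stay disjoint. A hedged host-side sanity check is sketched below; MB_SIZE = 1 MiB and the 512-byte AlignUp granularity are taken from the diff, while the check itself is my own illustration of the layout invariant.

```cpp
#include <cassert>
#include <cstdint>

constexpr uint64_t kMB = 1024ULL * 1024ULL;  // MB_SIZE in the kernel

uint64_t AlignUp(uint64_t v, uint64_t a) { return (v + a - 1) / a * a; }

// Mirrors PeermemInfo: [A | per-token scale (1 MB) | D ... | token-per-expert (2 MB)].
void CheckPeermemLayout(uint64_t segmentSize) {
    uint64_t offsetA = 0;
    uint64_t offsetScale = offsetA + AlignUp(segmentSize / 3, 512);
    uint64_t offsetD = offsetScale + kMB;
    uint64_t offsetTokenPerExpert = segmentSize - 2 * kMB;
    // The D region must at least start below the trailing 2 MB
    // communication matrix, or the regions overlap.
    assert(offsetD < offsetTokenPerExpert);
}
```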
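Review note on Tiling4VBSMultiCoreCompute in moe_init_routing_v2_tiling.h: the core-count math (ceil-divide, round up to a power of four, clamp to the physical AIV count, then align each core's chunk down to the sort granularity) is easy to misread, so here is a hedged standalone sketch with a worked example. SORT32_ALIGN_ELEMENT = 32 is an assumption based on the Sort32 naming, and CeilDiv/CeilLog4 are reimplemented here for illustration.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

constexpr int64_t kSortAlign = 32;  // assumed value of SORT32_ALIGN_ELEMENT

struct SortSplit { int64_t cores; int64_t perCoreAligned; int64_t lastCore; };

int64_t CeilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }
int64_t CeilLog4(int64_t n) {  // smallest e such that 4^e >= n
    int64_t e = 0;
    for (int64_t v = 1; v < n; v *= 4) { ++e; }
    return e;
}

// Mirrors the quoted logic: power-of-4 core count, clamped to the physical
// AIV count, with each core's chunk aligned down to the sort granularity.
SortSplit SplitForSort(int64_t totalLength, int64_t sortLoopMaxElement, int64_t aivNum) {
    int64_t cores = CeilDiv(totalLength, sortLoopMaxElement);
    cores = static_cast<int64_t>(std::pow(4, CeilLog4(cores)));
    cores = std::min(cores, aivNum);  // cannot exceed the physical core count
    int64_t perCore = totalLength / cores;
    int64_t perCoreAligned = perCore - perCore % kSortAlign;
    int64_t lastCore = totalLength - (cores - 1) * perCoreAligned;
    // Worked example: totalLength=100000, sortLoopMaxElement=8192, aivNum=40
    // -> CeilDiv gives 13, rounding to a power of 4 gives 16 cores,
    //    perCoreAligned = 6240, lastCore = 100000 - 15*6240 = 6400.
    return {cores, perCoreAligned, lastCore};
}
```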
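Review note on KernelMoeTokenUnpermute in moe_token_unpermute.h: once the tail-block handling is stripped away, each output token is a probs-weighted sum of its top-k permuted rows, with rows at or beyond num_out_tokens contributing zero (that semantic is inferred from the `cal_token_idx < this->num_out_tokens` guard). A scalar reference for one output token; all names in this sketch are mine.

```cpp
#include <cstdint>
#include <vector>

// out[h] = sum over k of probs[k] * permuted[indices[k]][h],
// where indices[k] >= numOutTokens means "dropped" and contributes zero.
std::vector<float> UnpermuteToken(const std::vector<float>& permuted,   // [numOut, hidden]
                                  const std::vector<int32_t>& indices,  // [topK] for this token
                                  const std::vector<float>& probs,      // [topK] for this token
                                  int64_t numOutTokens, int64_t hidden) {
    std::vector<float> out(hidden, 0.0f);
    for (size_t k = 0; k < indices.size(); ++k) {
        int32_t row = indices[k];
        if (row >= numOutTokens) {
            continue;  // dropped token: contributes zero
        }
        for (int64_t h = 0; h < hidden; ++h) {
            out[h] += probs[k] * permuted[row * hidden + h];
        }
    }
    return out;
}
```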
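Review note on block_epilogue_pertoken_swiglu.hpp: the Muls(-1) ➝ Exp ➝ Adds(1) ➝ Div ➝ Mul chain is SiLU on the first half of the row, gated by the second half, applied after the per-token dequant. A hedged scalar reference follows; dividing the abs-max by 127 for the int8 dynamic-quant scale is an assumption inferred from the Abs/ReduceMax pattern, not taken from the elided kernel code.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar view of the per-token epilogue: dequant -> swiglu -> dynamic int8 quant.
void SwigluEpilogueRef(const std::vector<float>& c,  // one matmul output row, length 2*n
                       float perTokenScale, int64_t n,
                       std::vector<int8_t>& yOut, float& dequantScaleOut) {
    std::vector<float> act(n);
    float absMax = 0.0f;
    for (int64_t i = 0; i < n; ++i) {
        float x1 = c[i] * perTokenScale;               // Muls by the per-token scale
        float gate = c[n + i] * perTokenScale;         // second half of the row
        float silu = x1 / (1.0f + std::exp(-x1));      // Muls(-1), Exp, Adds(1), Div
        act[i] = silu * gate;                          // Mul with ubCFp32[ChunkTileLen]
        absMax = std::max(absMax, std::fabs(act[i]));  // Abs + ReduceMax
    }
    dequantScaleOut = absMax > 0.0f ? absMax / 127.0f : 1.0f;  // assumed int8 scale
    yOut.resize(n);
    for (int64_t i = 0; i < n; ++i) {
        yOut[i] = static_cast<int8_t>(std::lround(act[i] / dequantScaleOut));
    }
}
```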