Update comment doc (#4731)

### What this PR does / why we need it?

Translate remaining Chinese comments in the `dispatch_ffn_combine` code to English and update the installation guide to remind users to initialize submodules when building from source.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: mojave2 <chenchen145@huawei.com>
Signed-off-by: Chen Chen <0109chenchen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-12-05 15:07:31 +08:00
parent b32ef53b3b
commit 7f33838e6e
16 changed files with 100 additions and 112 deletions
--- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_row.hpp
@@ -146,27 +146,27 @@ public:
            auto gmTileD = gmD[loopIdx * blockN];
            LayoutC layoutUbC{1, blockN};

-            // 把C从GM workspace搬到UB
+            // Move C from GM workspace to UB
            AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
            copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
            AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);

-            //在UB上做把C cast成FP32
+            // Cast C to FP32 in UB
            AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
            AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
            AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);

-            // 获取pertoken scale值，gmPerTokenScale的第loopIdx行
+            // Get per-token scale from row loopIdx of gmPerTokenScale
            ElementPerTokenScale perTokenScale = gmPerTokenScale(loopIdx);

            AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
            AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
-            // pertoken scale值与FP32的C做Muls乘法
+            // Multiply FP32 C by the per-token scale
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
            AscendC::PipeBarrier<PIPE_V>();

-            // 将muls结果转回fp16/bf16
+            // Cast the muls result back to fp16/bf16
            LayoutD layoutUbD{1, blockN};
            AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[ubListId]);

--- a/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/block_epilogue_pertoken_swiglu.hpp
@@ -140,7 +140,7 @@ public:
    {
        params = params_;
    }
-    // 每个tile就是1*7168，每个block是一个expert的所有token=[group[i], 7168]
+    // Each tile is 1x7168, and each block covers all tokens for one expert = [group[i], 7168]
    CATLASS_DEVICE
    void operator() (
        AscendC::GlobalTensor<ElementC> const &gmC,
@@ -200,39 +200,39 @@ public:
            auto gmTileD = gmD[loopIdx * ChunkTileLen];
            LayoutC layoutUbC{1, blockN};

-            // 把C从GM workspace搬到UB
+            // Move C from GM workspace to UB
            AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
            copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
            AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);

-            // 在UB上做把C cast成FP32
+            // Cast C to FP32 in UB
            AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
            AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
            AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);

-            // 获取pertoken scale值，gmPerTokenScale的第loopIdx行
+            // Get per-token scale from row loopIdx of gmPerTokenScale
            ElementPerTokenScale perTokenScale = gmPerTokenScale1(loopIdx);

            AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
            AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
-            // pertoken scale值与FP32的C做Muls乘法
+            // Multiply FP32 C by the per-token scale
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
            AscendC::PipeBarrier<PIPE_V>();

-            //swiglue计算过程
+            // SwiGLU computation: x / (1 + exp(-x)) on the first chunk, then multiply by the second chunk
            AscendC::Muls(ubCFp32ChunkN, ubCFp32, -1.0f, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Exp(ubCFp32ChunkN, ubCFp32ChunkN, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Adds(ubCFp32ChunkN, ubCFp32ChunkN, 1.0f, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
-            //TODO除的时候是否会对之后的数据有影响；
+            // TODO: confirm whether the division impacts subsequent data
            AscendC::Div(ubCFp32ChunkN, ubCFp32, ubCFp32ChunkN, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Mul(ubCFp32ChunkN, ubCFp32ChunkN, ubCFp32[ChunkTileLen], ChunkTileLen);
            
-            //quant过程，两种方式区别；
+            // Quantization step; TODO: document how the two approaches differ
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Abs(ubAbs, ubCFp32ChunkN, ChunkTileLen);
            AscendC::PipeBarrier<PIPE_V>();
@@ -243,7 +243,7 @@ public:
            AscendC::SetFlag<AscendC::HardEvent::V_S>(0);
            AscendC::WaitFlag<AscendC::HardEvent::V_S>(0);

-            //TODO两种计算方法的效率比较
+            // TODO: compare the efficiency of the two calculation methods
            ElementPerTokenScale GMubDequantScale = ubReduceMax.GetValue(0);
            AscendC::SetFlag<AscendC::HardEvent::S_V>(0);

--- a/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp
+++ b/csrc/dispatch_ffn_combine/op_kernel/utils/hccl_shmem.hpp
@@ -56,7 +56,7 @@ FORCE_INLINE_AICORE int32_t gm_signal_wait_until_eq_for_barrier(__gm__ int32_t *
 constexpr int32_t MAX_RANK_SIZE = 32;
 class HcclShmem {
 public:
-    #ifdef HCCL_COMM    // hccl需要初始化hccl context
+    #ifdef HCCL_COMM    // HCCL needs to initialize the HCCL context
    __gm__ HcclOpResParamCustom *WinContext_{nullptr};
    Hccl<HCCL_SERVER_TYPE_AICPU> hccl_;
    GM_ADDR m_ptrArray[MAX_RANK_SIZE];
@@ -92,7 +92,7 @@ public:
    #endif

    FORCE_INLINE_AICORE
-    GM_ADDR operator() () const {   // 无参数，返回本地peermem
+    GM_ADDR operator() () const {   // No argument: return local peermem
        #ifdef HCCL_COMM
            return m_ptrArray[m_rank];
        #else
@@ -101,7 +101,7 @@ public:
    }

    FORCE_INLINE_AICORE
-    GM_ADDR operator() (int32_t index) const {  // 带index参数，返回远端peermem首地址
+    GM_ADDR operator() (int32_t index) const {  // With index: return remote peermem base address
        #ifdef HCCL_COMM
            return m_ptrArray[index];
        #else
@@ -126,22 +126,6 @@ public:
        #endif
    }

-        // FORCE_INLINE_AICORE
-    // GM_ADDR operator () (GM_ADDR ptr, int32_t index) const  {   // shmem_ptr相同用法
-    //     #ifdef HCCL_COMM
-    //         size_t offset = ptr - m_ptrArray[m_rank];
-    //         if (offset < 0 || offset >= m_segmentSize) {
-    //             return nullptr;
-    //         }
-    //         if (index < 0 || index >= m_rankSize) {
-    //             return nullptr;
-    //         }
-    //         return m_ptrArray[index] + offset;
-    //     #else
-    //         return shmem_ptr(ptr, index);
-    //     #endif
-    // }
-

    FORCE_INLINE_AICORE
    ~HcclShmem() {