Update comment doc (#4731)

### What this PR does / why we need it?

Translate remaining Chinese comments in the `dispatch_ffn_combine` code
to English and update the installation guide to remind users to
initialize submodules when building from source.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: mojave2 <chenchen145@huawei.com>
Signed-off-by: Chen Chen <0109chenchen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
Chen Chen
2025-12-05 15:07:31 +08:00
committed by GitHub
parent b32ef53b3b
commit 7f33838e6e
16 changed files with 100 additions and 112 deletions

View File

@@ -146,27 +146,27 @@ public:
auto gmTileD = gmD[loopIdx * blockN];
LayoutC layoutUbC{1, blockN};
// 把C从GM workspace搬到UB
// Move C from GM workspace to UB
AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
//在UB上做把C cast成FP32
// Cast C to FP32 in UB
AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
// 获取pertoken scale值,gmPerTokenScale的第loopIdx行
// Get per-token scale from row loopIdx of gmPerTokenScale
ElementPerTokenScale perTokenScale = gmPerTokenScale(loopIdx);
AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
// pertoken scale值与FP32的C做Muls乘法
// Multiply FP32 C by the per-token scale
AscendC::PipeBarrier<PIPE_V>();
AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
AscendC::PipeBarrier<PIPE_V>();
// 将muls结果转回fp16/bf16
// Cast the muls result back to fp16/bf16
LayoutD layoutUbD{1, blockN};
AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[ubListId]);

View File

@@ -140,7 +140,7 @@ public:
{
params = params_;
}
// 每个tile就是1*7168每个block是一个expert的所有token=[group[i], 7168]
// Each tile is 1x7168, and each block covers all tokens for one expert = [group[i], 7168]
CATLASS_DEVICE
void operator() (
AscendC::GlobalTensor<ElementC> const &gmC,
@@ -200,39 +200,39 @@ public:
auto gmTileD = gmD[loopIdx * ChunkTileLen];
LayoutC layoutUbC{1, blockN};
// 把C从GM workspace搬到UB
// Move C from GM workspace to UB
AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
// 在UB上做把C cast成FP32
// Cast C to FP32 in UB
AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
// 获取pertoken scale值,gmPerTokenScale的第loopIdx行
// Get per-token scale from row loopIdx of gmPerTokenScale
ElementPerTokenScale perTokenScale = gmPerTokenScale1(loopIdx);
AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
// pertoken scale值与FP32的C做Muls乘法
// Multiply FP32 C by the per-token scale
AscendC::PipeBarrier<PIPE_V>();
AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
AscendC::PipeBarrier<PIPE_V>();
//swiglue计算过程
// Swiglu computation process
AscendC::Muls(ubCFp32ChunkN, ubCFp32, -1.0f, ChunkTileLen);
AscendC::PipeBarrier<PIPE_V>();
AscendC::Exp(ubCFp32ChunkN, ubCFp32ChunkN, ChunkTileLen);
AscendC::PipeBarrier<PIPE_V>();
AscendC::Adds(ubCFp32ChunkN, ubCFp32ChunkN, 1.0f, ChunkTileLen);
AscendC::PipeBarrier<PIPE_V>();
//TODO除的时候是否会对之后的数据有影响;
// TODO: confirm whether the division impacts subsequent data
AscendC::Div(ubCFp32ChunkN, ubCFp32, ubCFp32ChunkN, ChunkTileLen);
AscendC::PipeBarrier<PIPE_V>();
AscendC::Mul(ubCFp32ChunkN, ubCFp32ChunkN, ubCFp32[ChunkTileLen], ChunkTileLen);
//quant过程,两种方式区别;
// Quantization process; difference between the two approaches
AscendC::PipeBarrier<PIPE_V>();
AscendC::Abs(ubAbs, ubCFp32ChunkN, ChunkTileLen);
AscendC::PipeBarrier<PIPE_V>();
@@ -243,7 +243,7 @@ public:
AscendC::SetFlag<AscendC::HardEvent::V_S>(0);
AscendC::WaitFlag<AscendC::HardEvent::V_S>(0);
//TODO两种计算方法的效率比较
// TODO: compare the efficiency of the two calculation methods
ElementPerTokenScale GMubDequantScale = ubReduceMax.GetValue(0);
AscendC::SetFlag<AscendC::HardEvent::S_V>(0);

View File

@@ -56,7 +56,7 @@ FORCE_INLINE_AICORE int32_t gm_signal_wait_until_eq_for_barrier(__gm__ int32_t *
constexpr int32_t MAX_RANK_SIZE = 32;
class HcclShmem {
public:
#ifdef HCCL_COMM // hccl需要初始化hccl context
#ifdef HCCL_COMM // HCCL needs to initialize the HCCL context
__gm__ HcclOpResParamCustom *WinContext_{nullptr};
Hccl<HCCL_SERVER_TYPE_AICPU> hccl_;
GM_ADDR m_ptrArray[MAX_RANK_SIZE];
@@ -92,7 +92,7 @@ public:
#endif
FORCE_INLINE_AICORE
GM_ADDR operator() () const { // 无参数,返回本地peermem
GM_ADDR operator() () const { // No argument: return local peermem
#ifdef HCCL_COMM
return m_ptrArray[m_rank];
#else
@@ -101,7 +101,7 @@ public:
}
FORCE_INLINE_AICORE
GM_ADDR operator() (int32_t index) const { // 带index参数返回远端peermem首地址
GM_ADDR operator() (int32_t index) const { // With index: return remote peermem base address
#ifdef HCCL_COMM
return m_ptrArray[index];
#else
@@ -126,22 +126,6 @@ public:
#endif
}
// FORCE_INLINE_AICORE
// GM_ADDR operator () (GM_ADDR ptr, int32_t index) const { // shmem_ptr相同用法
// #ifdef HCCL_COMM
// size_t offset = ptr - m_ptrArray[m_rank];
// if (offset < 0 || offset >= m_segmentSize) {
// return nullptr;
// }
// if (index < 0 || index >= m_rankSize) {
// return nullptr;
// }
// return m_ptrArray[index] + offset;
// #else
// return shmem_ptr(ptr, index);
// #endif
// }
FORCE_INLINE_AICORE
~HcclShmem() {