Update comment doc (#4731)
### What this PR does / why we need it?
Translate remaining Chinese comments in the `dispatch_ffn_combine` code
to English and update the installation guide to remind users to
initialize submodules when building from source.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: mojave2 <chenchen145@huawei.com>
Signed-off-by: Chen Chen <0109chenchen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -146,27 +146,27 @@ public:
|
||||
auto gmTileD = gmD[loopIdx * blockN];
|
||||
LayoutC layoutUbC{1, blockN};
|
||||
|
||||
// 把C从GM workspace搬到UB
|
||||
// Move C from GM workspace to UB
|
||||
AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
|
||||
copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
|
||||
AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
|
||||
|
||||
//在UB上做把C cast成FP32
|
||||
// Cast C to FP32 in UB
|
||||
AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
|
||||
AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
|
||||
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
|
||||
|
||||
// 获取pertoken scale值,gmPerTokenScale的第loopIdx行
|
||||
// Get per-token scale from row loopIdx of gmPerTokenScale
|
||||
ElementPerTokenScale perTokenScale = gmPerTokenScale(loopIdx);
|
||||
|
||||
AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
|
||||
AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
|
||||
// pertoken scale值与FP32的C做Muls乘法
|
||||
// Multiply FP32 C by the per-token scale
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
// 将muls结果转回fp16/bf16
|
||||
// Cast the muls result back to fp16/bf16
|
||||
LayoutD layoutUbD{1, blockN};
|
||||
AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(eventUbDMTE3VList[ubListId]);
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ public:
|
||||
{
|
||||
params = params_;
|
||||
}
|
||||
// 每个tile就是1*7168,每个block是一个expert的所有token=[group[i], 7168]
|
||||
// Each tile is 1x7168, and each block covers all tokens for one expert = [group[i], 7168]
|
||||
CATLASS_DEVICE
|
||||
void operator() (
|
||||
AscendC::GlobalTensor<ElementC> const &gmC,
|
||||
@@ -200,39 +200,39 @@ public:
|
||||
auto gmTileD = gmD[loopIdx * ChunkTileLen];
|
||||
LayoutC layoutUbC{1, blockN};
|
||||
|
||||
// 把C从GM workspace搬到UB
|
||||
// Move C from GM workspace to UB
|
||||
AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
|
||||
copyGmToUbC(ubC, gmTileC, layoutUbC, layoutUbC);
|
||||
AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
|
||||
|
||||
// 在UB上做把C cast成FP32
|
||||
// Cast C to FP32 in UB
|
||||
AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(eventUbCMTE2VList[ubListId]);
|
||||
AscendC::Cast(ubCFp32, ubC, AscendC::RoundMode::CAST_NONE, blockN);
|
||||
AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(eventUbCVMTE2List[ubListId]);
|
||||
|
||||
// 获取pertoken scale值,gmPerTokenScale的第loopIdx行
|
||||
// Get per-token scale from row loopIdx of gmPerTokenScale
|
||||
ElementPerTokenScale perTokenScale = gmPerTokenScale1(loopIdx);
|
||||
|
||||
AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
|
||||
AscendC::WaitFlag<AscendC::HardEvent::S_V>(0);
|
||||
// pertoken scale值与FP32的C做Muls乘法
|
||||
// Multiply FP32 C by the per-token scale
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
AscendC::Muls(ubCFp32, ubCFp32, perTokenScale, blockN);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
|
||||
//swiglue计算过程
|
||||
// SwiGLU computation
|
||||
AscendC::Muls(ubCFp32ChunkN, ubCFp32, -1.0f, ChunkTileLen);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
AscendC::Exp(ubCFp32ChunkN, ubCFp32ChunkN, ChunkTileLen);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
AscendC::Adds(ubCFp32ChunkN, ubCFp32ChunkN, 1.0f, ChunkTileLen);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
//TODO除的时候是否会对之后的数据有影响;
|
||||
// TODO: confirm whether the division impacts subsequent data
|
||||
AscendC::Div(ubCFp32ChunkN, ubCFp32, ubCFp32ChunkN, ChunkTileLen);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
AscendC::Mul(ubCFp32ChunkN, ubCFp32ChunkN, ubCFp32[ChunkTileLen], ChunkTileLen);
|
||||
|
||||
//quant过程,两种方式区别;
|
||||
// Quantization process; difference between the two approaches
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
AscendC::Abs(ubAbs, ubCFp32ChunkN, ChunkTileLen);
|
||||
AscendC::PipeBarrier<PIPE_V>();
|
||||
@@ -243,7 +243,7 @@ public:
|
||||
AscendC::SetFlag<AscendC::HardEvent::V_S>(0);
|
||||
AscendC::WaitFlag<AscendC::HardEvent::V_S>(0);
|
||||
|
||||
//TODO两种计算方法的效率比较
|
||||
// TODO: compare the efficiency of the two calculation methods
|
||||
ElementPerTokenScale GMubDequantScale = ubReduceMax.GetValue(0);
|
||||
AscendC::SetFlag<AscendC::HardEvent::S_V>(0);
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ FORCE_INLINE_AICORE int32_t gm_signal_wait_until_eq_for_barrier(__gm__ int32_t *
|
||||
constexpr int32_t MAX_RANK_SIZE = 32;
|
||||
class HcclShmem {
|
||||
public:
|
||||
#ifdef HCCL_COMM // hccl需要初始化hccl context
|
||||
#ifdef HCCL_COMM // HCCL needs to initialize the HCCL context
|
||||
__gm__ HcclOpResParamCustom *WinContext_{nullptr};
|
||||
Hccl<HCCL_SERVER_TYPE_AICPU> hccl_;
|
||||
GM_ADDR m_ptrArray[MAX_RANK_SIZE];
|
||||
@@ -92,7 +92,7 @@ public:
|
||||
#endif
|
||||
|
||||
FORCE_INLINE_AICORE
|
||||
GM_ADDR operator() () const { // 无参数,返回本地peermem
|
||||
GM_ADDR operator() () const { // No argument: return local peermem
|
||||
#ifdef HCCL_COMM
|
||||
return m_ptrArray[m_rank];
|
||||
#else
|
||||
@@ -101,7 +101,7 @@ public:
|
||||
}
|
||||
|
||||
FORCE_INLINE_AICORE
|
||||
GM_ADDR operator() (int32_t index) const { // 带index参数,返回远端peermem首地址
|
||||
GM_ADDR operator() (int32_t index) const { // With index: return remote peermem base address
|
||||
#ifdef HCCL_COMM
|
||||
return m_ptrArray[index];
|
||||
#else
|
||||
@@ -126,22 +126,6 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
// FORCE_INLINE_AICORE
|
||||
// GM_ADDR operator () (GM_ADDR ptr, int32_t index) const { // shmem_ptr相同用法
|
||||
// #ifdef HCCL_COMM
|
||||
// size_t offset = ptr - m_ptrArray[m_rank];
|
||||
// if (offset < 0 || offset >= m_segmentSize) {
|
||||
// return nullptr;
|
||||
// }
|
||||
// if (index < 0 || index >= m_rankSize) {
|
||||
// return nullptr;
|
||||
// }
|
||||
// return m_ptrArray[index] + offset;
|
||||
// #else
|
||||
// return shmem_ptr(ptr, index);
|
||||
// #endif
|
||||
// }
|
||||
|
||||
|
||||
FORCE_INLINE_AICORE
|
||||
~HcclShmem() {
|
||||
|
||||
Reference in New Issue
Block a user