[Kernel] add custom moe ops for prefill (#4194)
### What this PR does / why we need it?
1. Add the implementation of normal Aclnn operators: MoeCombineNormal,
MoeDispatchNormal, NotifyDispatch, and DispatchLayout.
- MoeCombineNormal: Implements the combine logic within MoE operations.
- MoeDispatchNormal: Implements the dispatch logic within MoE
operations.
- NotifyDispatch: Exchanges topk_idx information among different ranks
to calculate the device memory required for the dispatch stage.
- DispatchLayout: Used to calculate information related to the device
memory layout for the dispatch stage.
2. Provide PyTorch interfaces for normal operators—get_dispatch_layout,
dispatch_prefill, and combine_prefill—to be used for MoE communication
during the prefill stage in vLLM.
- get_dispatch_layout: Calculates information related to the device
memory layout for the dispatch operator, and is called before
dispatch_prefill.
- dispatch_prefill: Initiates the dispatch operation.
- combine_prefill: Initiates the combine operation.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
The functionality has already been validated using the local Qwen model.
Test cases will be added after support for multi-NPU use cases in the CI
pipeline is finalized.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
Signed-off-by: shiro-zzzz <zhangdianhao@huawei.com>
This commit is contained in:
17
csrc/dispatch_layout/op_kernel/dispatch_layout.cpp
Normal file
17
csrc/dispatch_layout/op_kernel/dispatch_layout.cpp
Normal file
@@ -0,0 +1,17 @@
|
||||
#include "kernel_operator.h"
|
||||
#include "dispatch_layout.h"
|
||||
#include "dispatch_layout_tiling.h"
|
||||
|
||||
|
||||
/**
 * Kernel entry point of the dispatch_layout operator.
 *
 * Unpacks the tiling blob into a DispatchLayoutTilingData, then hands all
 * global-memory addresses to a DispatchLayout<int32_t> instance that does the
 * actual work.
 *
 * @param topkIdx            GM address read by the operator as int64 indices.
 * @param numTokensPerRank   GM address of the per-rank counter output.
 * @param numTokensPerExpert GM address of the per-expert counter output.
 * @param isTokenInRank      GM address of the per-token rank-membership output.
 * @param workspace          GM workspace address, forwarded to Init().
 * @param tiling             GM address of the serialized tiling data.
 */
extern "C" __global__ __aicore__ void dispatch_layout(GM_ADDR topkIdx, GM_ADDR numTokensPerRank, GM_ADDR numTokensPerExpert,
                                                      GM_ADDR isTokenInRank, GM_ADDR workspace, GM_ADDR tiling)
{
    // Tiling registration has to precede the extraction of the tiling struct.
    REGISTER_TILING_DEFAULT(DispatchLayoutTilingData);
    GET_TILING_DATA_WITH_STRUCT(DispatchLayoutTilingData, layoutTiling, tiling);

    // The pipe owns the local buffers that the operator allocates in Process().
    TPipe localPipe;

    DispatchLayout<int32_t> layoutOp;
    layoutOp.Init(topkIdx, numTokensPerRank, numTokensPerExpert, isTokenInRank, workspace, &localPipe, &layoutTiling);
    layoutOp.Process();
}
|
||||
153
csrc/dispatch_layout/op_kernel/dispatch_layout.h
Normal file
153
csrc/dispatch_layout/op_kernel/dispatch_layout.h
Normal file
@@ -0,0 +1,153 @@
|
||||
#ifndef DISPATCH_LAYOUT_H
|
||||
#define DISPATCH_LAYOUT_H
|
||||
|
||||
#include <climits>
|
||||
#include "kernel_operator.h"
|
||||
|
||||
#include "../common/comm_args.h"
|
||||
#include "../common/data_copy.h"
|
||||
#include "../common/sync_collectives.h"
|
||||
#include "../common/moe_distribute_base.h"
|
||||
#include "dispatch_layout_tiling.h"
|
||||
|
||||
using namespace AscendC;
|
||||
using namespace Moe;
|
||||
|
||||
// Granule, in bytes, to which local buffer sizes in this header are rounded up.
constexpr uint32_t UB_32_ALIGN = 32U;
// Number of cores the token range is divided across by DispatchLayout::Init.
// NOTE(review): presumably must match the launch block dim chosen by the host tiling - confirm.
constexpr uint32_t AIV_NUM = 48U;
|
||||
|
||||
template <AscendC::HardEvent event>
|
||||
__aicore__ inline void SyncFunc()
|
||||
{
|
||||
int32_t eventID = static_cast<int32_t>(GetTPipePtr()->FetchEventID(event));
|
||||
AscendC::SetFlag<event>(eventID);
|
||||
AscendC::WaitFlag<event>(eventID);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class DispatchLayout {
|
||||
|
||||
public:
|
||||
__aicore__ inline DispatchLayout() {};
|
||||
|
||||
__aicore__ inline void Init(GM_ADDR topkIdx, GM_ADDR numTokensPerRank, GM_ADDR numTokensPerExpert, GM_ADDR isTokenInRank,
|
||||
GM_ADDR workspace, TPipe *pipe, const DispatchLayoutTilingData *tilingData)
|
||||
{
|
||||
numTokens_ = tilingData->dispatchLayoutInfo.numTokens;
|
||||
numRanks_ = tilingData->dispatchLayoutInfo.numRanks;
|
||||
numExperts_ = tilingData->dispatchLayoutInfo.numExperts;
|
||||
numTopk_ = tilingData->dispatchLayoutInfo.numTopk;
|
||||
tpipe_ = pipe;
|
||||
|
||||
coreIdx_ = GetBlockIdx();
|
||||
uint32_t temp = numTokens_ / AIV_NUM;
|
||||
uint32_t restNum = numTokens_ % AIV_NUM;
|
||||
int64_t topkIdxOffset;
|
||||
int64_t isTokenOffset;
|
||||
tempTokens_ = temp;
|
||||
if (coreIdx_ < restNum) {
|
||||
tempTokens_++;
|
||||
}
|
||||
topkIdx32AlignIntLen_ = Ceil(tempTokens_ * numTopk_ * sizeof(int64_t), UB_32_ALIGN) * UB_32_ALIGN;
|
||||
numTokensPerRank32AlignIntLen_ = Ceil(numRanks_ * sizeof(T), UB_32_ALIGN) * UB_32_ALIGN;
|
||||
numTokensPerExpert32AlignIntLen_ = Ceil(numExperts_ * sizeof(T), UB_32_ALIGN) * UB_32_ALIGN;
|
||||
isTokenInRank32AlignIntLen_ = Ceil(tempTokens_ * numRanks_ * sizeof(T), UB_32_ALIGN) * UB_32_ALIGN;
|
||||
|
||||
if (coreIdx_ < restNum) {
|
||||
topkIdxOffset = coreIdx_ * topkIdx32AlignIntLen_;
|
||||
isTokenOffset = coreIdx_ * isTokenInRank32AlignIntLen_;
|
||||
} else {
|
||||
topkIdxOffset = restNum * Ceil((tempTokens_ + 1) * numTopk_ * sizeof(int64_t), UB_32_ALIGN) * UB_32_ALIGN
|
||||
+ (coreIdx_ - restNum) * topkIdx32AlignIntLen_;
|
||||
isTokenOffset = restNum * Ceil((tempTokens_ + 1) * numRanks_ * sizeof(T), UB_32_ALIGN) * UB_32_ALIGN
|
||||
+ (coreIdx_ - restNum) * isTokenInRank32AlignIntLen_;
|
||||
}
|
||||
|
||||
topkIdxGM_.SetGlobalBuffer((__gm__ int64_t*)(topkIdx + topkIdxOffset));
|
||||
numTokensPerRankGM_.SetGlobalBuffer((__gm__ T*)numTokensPerRank);
|
||||
numTokensPerExpertGM_.SetGlobalBuffer((__gm__ T*)numTokensPerExpert);
|
||||
isTokenInRankGM_.SetGlobalBuffer((__gm__ T*)(isTokenInRank + isTokenOffset));
|
||||
|
||||
|
||||
}
|
||||
|
||||
__aicore__ inline void Process()
|
||||
{
|
||||
tpipe_->Reset();
|
||||
tpipe_->InitBuffer(topkIdxBuf_, topkIdx32AlignIntLen_);
|
||||
tpipe_->InitBuffer(numTokensPerRankBuf_, numTokensPerRank32AlignIntLen_);
|
||||
tpipe_->InitBuffer(numTokensPerExpertBuf_, numTokensPerExpert32AlignIntLen_);
|
||||
tpipe_->InitBuffer(isTokenInRankBuf_, isTokenInRank32AlignIntLen_);
|
||||
tpipe_->InitBuffer(seenRankBuf_, numRanks_ * sizeof(T));
|
||||
|
||||
LocalTensor<int64_t> topkIdxTensor = topkIdxBuf_.AllocTensor<int64_t>();
|
||||
const DataCopyExtParams dataCopyParams{1U, topkIdx32AlignIntLen_, 0U, 0U, 0U};
|
||||
const DataCopyPadExtParams<int64_t> padParams{false, 0U, 0U, 0U};
|
||||
DataCopyPad(topkIdxTensor, topkIdxGM_, dataCopyParams, padParams);
|
||||
SyncFunc<AscendC::HardEvent::MTE2_S>();
|
||||
|
||||
LocalTensor<T> numTokensPerRankTensor = numTokensPerRankBuf_.AllocTensor<T>();
|
||||
LocalTensor<T> numTokensPerExpertTensor = numTokensPerExpertBuf_.AllocTensor<T>();
|
||||
LocalTensor<T> isTokenInRankTensor = isTokenInRankBuf_.AllocTensor<T>();
|
||||
LocalTensor<T> seenRankTensor = seenRankBuf_.AllocTensor<T>();
|
||||
Duplicate<T>(numTokensPerRankTensor, 0, numRanks_);
|
||||
Duplicate<T>(numTokensPerExpertTensor, 0, numExperts_);
|
||||
Duplicate<T>(isTokenInRankTensor, 0, tempTokens_ * numRanks_);
|
||||
SyncFunc<AscendC::HardEvent::V_S>();
|
||||
|
||||
int experts_per_rank = numExperts_ / numRanks_;
|
||||
for (int i = 0; i < tempTokens_; ++i) {
|
||||
SyncFunc<AscendC::HardEvent::S_V>();
|
||||
Duplicate<T>(seenRankTensor, 0, numRanks_);
|
||||
SyncFunc<AscendC::HardEvent::V_S>();
|
||||
for (int j = 0; j < numTopk_; ++j) {
|
||||
int64_t expert_idx = topkIdxTensor.GetValue(i * numTopk_ + j);
|
||||
uint32_t per_expert_num = numTokensPerExpertTensor.GetValue(expert_idx) + 1;
|
||||
numTokensPerExpertTensor.SetValue(expert_idx, per_expert_num);
|
||||
int rank_id = expert_idx / experts_per_rank;
|
||||
if (!seenRankTensor.GetValue(rank_id)) {
|
||||
uint32_t per_rank_num = numTokensPerRankTensor.GetValue(rank_id) + 1;
|
||||
isTokenInRankTensor.SetValue(i * numRanks_ + rank_id, 1);
|
||||
seenRankTensor.SetValue(rank_id, 1);
|
||||
numTokensPerRankTensor.SetValue(rank_id, per_rank_num);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const DataCopyExtParams isTokenInRankDataCopyParams{1U, isTokenInRank32AlignIntLen_, 0U, 0U, 0U};
|
||||
DataCopyPad(isTokenInRankGM_, isTokenInRankTensor, isTokenInRankDataCopyParams);
|
||||
AscendC::SetAtomicAdd<T>();
|
||||
const DataCopyExtParams numTokensPerRankDataCopyParams{1U, numTokensPerRank32AlignIntLen_, 0U, 0U, 0U};
|
||||
DataCopyPad(numTokensPerRankGM_, numTokensPerRankTensor, numTokensPerRankDataCopyParams);
|
||||
const DataCopyExtParams numTokensPerExpertDataCopyParams{1U, numTokensPerExpert32AlignIntLen_, 0U, 0U, 0U};
|
||||
DataCopyPad(numTokensPerExpertGM_, numTokensPerExpertTensor, numTokensPerExpertDataCopyParams);
|
||||
AscendC::SetAtomicNone();
|
||||
}
|
||||
|
||||
private:
|
||||
GlobalTensor<int64_t> topkIdxGM_;
|
||||
GlobalTensor<T> numTokensPerRankGM_;
|
||||
GlobalTensor<T> numTokensPerExpertGM_;
|
||||
GlobalTensor<T> isTokenInRankGM_;
|
||||
|
||||
TBuf<> topkIdxBuf_;
|
||||
TBuf<> numTokensPerRankBuf_;
|
||||
TBuf<> numTokensPerExpertBuf_;
|
||||
TBuf<> isTokenInRankBuf_;
|
||||
TBuf<> seenRankBuf_;
|
||||
|
||||
TPipe *tpipe_{nullptr};
|
||||
uint32_t numTokens_{0};
|
||||
uint32_t numRanks_{0};
|
||||
uint32_t numExperts_{0};
|
||||
uint32_t numTopk_{0};
|
||||
uint32_t coreIdx_{0};
|
||||
uint32_t tempTokens_{0};
|
||||
|
||||
uint32_t topkIdx32AlignIntLen_{0};
|
||||
uint32_t numTokensPerRank32AlignIntLen_{0};
|
||||
uint32_t numTokensPerExpert32AlignIntLen_{0};
|
||||
uint32_t isTokenInRank32AlignIntLen_{0};
|
||||
};
|
||||
|
||||
#endif // DISPATCH_LAYOUT_H
|
||||
20
csrc/dispatch_layout/op_kernel/dispatch_layout_tiling.h
Normal file
20
csrc/dispatch_layout/op_kernel/dispatch_layout_tiling.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#ifndef DISPATCH_LAYOUT_TILING_H
|
||||
#define DISPATCH_LAYOUT_TILING_H
|
||||
|
||||
#include "kernel_tiling/kernel_tiling.h"
|
||||
|
||||
// Operator-specific tiling parameters of dispatch_layout.
// The kernel obtains this struct from a raw tiling blob, so the field order
// and sizes must stay exactly as declared here.
struct DispatchLayoutInfo {
    uint32_t numTokens;    // read by the kernel as the total token count to split across cores
    uint32_t numRanks;     // read by the kernel as the number of ranks
    uint32_t numExperts;   // read by the kernel as the total number of experts
    uint32_t numTopk;      // read by the kernel as the number of expert indices per token
    uint64_t totalUbSize;  // NOTE(review): presumably the available UB size in bytes; not read by the kernel code - confirm
};

// Four 32-bit fields followed by one 64-bit field: catch accidental layout
// changes at compile time rather than as a silent mismatch with the tiling blob.
static_assert(sizeof(DispatchLayoutInfo) == 24, "DispatchLayoutInfo layout must stay in sync with the tiling blob");
|
||||
|
||||
struct DispatchLayoutTilingData {
|
||||
Mc2InitTiling mc2InitTiling;
|
||||
Mc2CcTiling mc2CcTiling1;
|
||||
DispatchLayoutInfo dispatchLayoutInfo;
|
||||
};
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user