[Refactor] Add expert processed token count output for DispatchFFNCombine/DispatchFFNCombineBF16 (#6402)
### What this PR does / why we need it?
Add a new output for the expert token count.
An additional output tensor, expert_token_nums, is added to both operators
so that the distribution of tokens among experts can be tracked:
Tensor Name: expert_token_nums
Dimension: 1D tensor
Shape: (local_expert_num,)
Data Type: int32
Semantics: Represents the number of tokens actually received by each
expert on the current card.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
---------
Signed-off-by: guanguan0308 <1546542263@qq.com>
Signed-off-by: guanguan0308 <162653673+guanguan0308@users.noreply.github.com>
This commit is contained in:
@@ -55,7 +55,7 @@ class DispatchFFNCombine {
|
||||
public:
|
||||
__aicore__ inline DispatchFFNCombine() {};
|
||||
__aicore__ inline void Init(GM_ADDR xGM, GM_ADDR weight1GM, GM_ADDR weight2GM, GM_ADDR expertIdGM, GM_ADDR scale1GM, GM_ADDR scale2GM,
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR workspaceGM, GM_ADDR tilingGM);
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR expertTokenNums, GM_ADDR workspaceGM, GM_ADDR tilingGM);
|
||||
__aicore__ inline void Process();
|
||||
|
||||
|
||||
@@ -68,6 +68,7 @@ private:
|
||||
GM_ADDR scale2GM_;
|
||||
GM_ADDR probs_;
|
||||
GM_ADDR outGM_;
|
||||
GM_ADDR gmExpertTokenNums_;
|
||||
GM_ADDR workspaceGM_;
|
||||
|
||||
GM_ADDR moeInitRoutingQuantV2Scale = nullptr;
|
||||
@@ -112,7 +113,7 @@ private:
|
||||
|
||||
template <TemplateMMA2AClass>
|
||||
__aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Init(GM_ADDR xGM, GM_ADDR weight1GM, GM_ADDR weight2GM, GM_ADDR expertIdGM, GM_ADDR scale1GM, GM_ADDR scale2GM,
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR workspaceGM, GM_ADDR tilingGM)
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR expertTokenNums, GM_ADDR workspaceGM, GM_ADDR tilingGM)
|
||||
{
|
||||
REGISTER_TILING_DEFAULT(DispatchFFNCombineTilingData);
|
||||
auto tiling = (__gm__ DispatchFFNCombineTilingData*)tilingGM;
|
||||
@@ -127,6 +128,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Init(GM_ADDR xGM,
|
||||
probs_ = probs;
|
||||
|
||||
outGM_ = outGM;
|
||||
gmExpertTokenNums_ = expertTokenNums;
|
||||
|
||||
workspaceGM_ = workspaceGM;
|
||||
|
||||
@@ -268,7 +270,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
outGM_, layoutD1, layoutD2,
|
||||
expertIdGM_, moeInitRoutingQuantV2Scale, moeInitRoutingQuantV2Offset,
|
||||
expertTokensBeforeCapacity, probs_,
|
||||
workspaceGM_, ubMoveNum, moeInitRoutingQuantV2TilingData};
|
||||
workspaceGM_, gmExpertTokenNums_, ubMoveNum, moeInitRoutingQuantV2TilingData};
|
||||
//Call kernel
|
||||
MatmulKernel kernel(params);
|
||||
kernel(params);
|
||||
|
||||
Reference in New Issue
Block a user