[Refactor] Add expert processed token count output for DispatchFFNCombine/DispatchFFNCombineBF16 (#6402)

### What this PR does / why we need it?
Add a new output reporting the per-expert processed token count.
An additional output tensor, expert_token_nums, is added to both operators
to support tracking how tokens are distributed among the experts:

Tensor Name: expert_token_nums
Dimension: 1D tensor
Shape: (local_expert_num,)
Data Type: int32
Semantics: the number of tokens actually received by each local
expert on the current card.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.14.1
- vLLM main:
dc917cceb8

---------

Signed-off-by: guanguan0308 <1546542263@qq.com>
Signed-off-by: guanguan0308 <162653673+guanguan0308@users.noreply.github.com>
This commit is contained in:
guanguan0308
2026-02-03 10:41:06 +08:00
committed by GitHub
parent 26b83f8bde
commit dffac6db73
18 changed files with 97 additions and 84 deletions

View File

@@ -55,7 +55,7 @@ class DispatchFFNCombine {
public:
__aicore__ inline DispatchFFNCombine() {};
__aicore__ inline void Init(GM_ADDR xGM, GM_ADDR weight1GM, GM_ADDR weight2GM, GM_ADDR expertIdGM, GM_ADDR scale1GM, GM_ADDR scale2GM,
GM_ADDR probs, GM_ADDR outGM, GM_ADDR workspaceGM, GM_ADDR tilingGM);
GM_ADDR probs, GM_ADDR outGM, GM_ADDR expertTokenNums, GM_ADDR workspaceGM, GM_ADDR tilingGM);
__aicore__ inline void Process();
@@ -68,6 +68,7 @@ private:
GM_ADDR scale2GM_;
GM_ADDR probs_;
GM_ADDR outGM_;
GM_ADDR gmExpertTokenNums_;
GM_ADDR workspaceGM_;
GM_ADDR moeInitRoutingQuantV2Scale = nullptr;
@@ -112,7 +113,7 @@ private:
template <TemplateMMA2AClass>
__aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Init(GM_ADDR xGM, GM_ADDR weight1GM, GM_ADDR weight2GM, GM_ADDR expertIdGM, GM_ADDR scale1GM, GM_ADDR scale2GM,
GM_ADDR probs, GM_ADDR outGM, GM_ADDR workspaceGM, GM_ADDR tilingGM)
GM_ADDR probs, GM_ADDR outGM, GM_ADDR expertTokenNums, GM_ADDR workspaceGM, GM_ADDR tilingGM)
{
REGISTER_TILING_DEFAULT(DispatchFFNCombineTilingData);
auto tiling = (__gm__ DispatchFFNCombineTilingData*)tilingGM;
@@ -127,6 +128,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Init(GM_ADDR xGM,
probs_ = probs;
outGM_ = outGM;
gmExpertTokenNums_ = expertTokenNums;
workspaceGM_ = workspaceGM;
@@ -268,7 +270,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
outGM_, layoutD1, layoutD2,
expertIdGM_, moeInitRoutingQuantV2Scale, moeInitRoutingQuantV2Offset,
expertTokensBeforeCapacity, probs_,
workspaceGM_, ubMoveNum, moeInitRoutingQuantV2TilingData};
workspaceGM_, gmExpertTokenNums_, ubMoveNum, moeInitRoutingQuantV2TilingData};
//Call kernel
MatmulKernel kernel(params);
kernel(params);