[Refactor] Add expert processed token count output for DispatchFFNCombine/DispatchFFNCombineBF16 (#6402)
### What this PR does / why we need it?
Add a new output for the expert token count.
An additional output tensor, expert_token_nums, is added to both operators
so that the distribution of tokens among experts can be tracked:
Tensor Name: expert_token_nums
Dimension: 1D tensor
Shape: (local_expert_num,)
Data Type: int32
Semantics: Represents the number of tokens actually received by each
expert on the current card.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main: dc917cceb8
---------
Signed-off-by: guanguan0308 <1546542263@qq.com>
Signed-off-by: guanguan0308 <162653673+guanguan0308@users.noreply.github.com>
This commit is contained in:
@@ -55,7 +55,7 @@ class DispatchFFNCombine {
|
||||
public:
|
||||
__aicore__ inline DispatchFFNCombine() {};
|
||||
__aicore__ inline void Init(GM_ADDR xGM, GM_ADDR weight1GM, GM_ADDR weight2GM, GM_ADDR expertIdGM, GM_ADDR scale1GM, GM_ADDR scale2GM,
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR workspaceGM, GM_ADDR tilingGM);
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR expertTokenNums, GM_ADDR workspaceGM, GM_ADDR tilingGM);
|
||||
__aicore__ inline void Process();
|
||||
|
||||
|
||||
@@ -68,6 +68,7 @@ private:
|
||||
GM_ADDR scale2GM_;
|
||||
GM_ADDR probs_;
|
||||
GM_ADDR outGM_;
|
||||
GM_ADDR gmExpertTokenNums_;
|
||||
GM_ADDR workspaceGM_;
|
||||
|
||||
GM_ADDR moeInitRoutingQuantV2Scale = nullptr;
|
||||
@@ -112,7 +113,7 @@ private:
|
||||
|
||||
template <TemplateMMA2AClass>
|
||||
__aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Init(GM_ADDR xGM, GM_ADDR weight1GM, GM_ADDR weight2GM, GM_ADDR expertIdGM, GM_ADDR scale1GM, GM_ADDR scale2GM,
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR workspaceGM, GM_ADDR tilingGM)
|
||||
GM_ADDR probs, GM_ADDR outGM, GM_ADDR expertTokenNums, GM_ADDR workspaceGM, GM_ADDR tilingGM)
|
||||
{
|
||||
REGISTER_TILING_DEFAULT(DispatchFFNCombineTilingData);
|
||||
auto tiling = (__gm__ DispatchFFNCombineTilingData*)tilingGM;
|
||||
@@ -127,6 +128,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Init(GM_ADDR xGM,
|
||||
probs_ = probs;
|
||||
|
||||
outGM_ = outGM;
|
||||
gmExpertTokenNums_ = expertTokenNums;
|
||||
|
||||
workspaceGM_ = workspaceGM;
|
||||
|
||||
@@ -268,7 +270,7 @@ __aicore__ inline void DispatchFFNCombine<TemplateMMA2ACFunc>::Process()
|
||||
outGM_, layoutD1, layoutD2,
|
||||
expertIdGM_, moeInitRoutingQuantV2Scale, moeInitRoutingQuantV2Offset,
|
||||
expertTokensBeforeCapacity, probs_,
|
||||
workspaceGM_, ubMoveNum, moeInitRoutingQuantV2TilingData};
|
||||
workspaceGM_, gmExpertTokenNums_, ubMoveNum, moeInitRoutingQuantV2TilingData};
|
||||
//Call kernel
|
||||
MatmulKernel kernel(params);
|
||||
kernel(params);
|
||||
|
||||
Reference in New Issue
Block a user