[Refactor] Add expert processed token count output for DispatchFFNCombine/DispatchFFNCombineBF16 (#6402)

### What this PR does / why we need it?
Add a new output for the expert token count.
An additional output tensor, expert_token_nums, is added to both operators
so that the distribution of tokens among experts can be tracked:

Tensor Name: expert_token_nums
Dimension: 1D tensor
Shape: (local_expert_num,)
Data Type: int32
Semantics: Represents the number of tokens actually received by each
expert on the current card.
### Does this PR introduce _any_ user-facing change?

Yes. Both DispatchFFNCombine and DispatchFFNCombineBF16 now produce an
additional required output tensor, expert_token_nums (shape
(local_expert_num,), dtype int32), and the aclnn workspace/launch APIs take a
corresponding extra expertTokenNums argument, so existing callers must be
updated to pass it.
### How was this patch tested?

- vLLM version: v0.14.1
- vLLM main:
dc917cceb8

---------

Signed-off-by: guanguan0308 <1546542263@qq.com>
Signed-off-by: guanguan0308 <162653673+guanguan0308@users.noreply.github.com>
This commit is contained in:
guanguan0308
2026-02-03 10:41:06 +08:00
committed by GitHub
parent 26b83f8bde
commit dffac6db73
18 changed files with 97 additions and 84 deletions

View File

@@ -47,7 +47,7 @@ extern aclnnStatus aclnnInnerDispatchFFNCombineGetWorkspaceSize(const aclTensor*
const aclTensor* probs,
const char* group, int64_t maxOutputSize,
bool transB, bool weightNz,
const aclTensor* out,
const aclTensor* out, const aclTensor* expertTokenNums,
uint64_t* workspaceSize, aclOpExecutor** executor);
extern aclnnStatus aclnnInnerDispatchFFNCombine(void *workspace, uint64_t workspaceSize,
aclOpExecutor *executor, aclrtStream stream);
@@ -59,7 +59,7 @@ aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const ac
const aclTensor* expertId, const aclTensorList* scale1, const aclTensorList* scale2,
const aclTensor* probs,
const char* group, int64_t maxOutputSize,
const aclTensor* out,
const aclTensor* out, const aclTensor* expertTokenNums,
uint64_t* workspaceSize, aclOpExecutor** executor)
{
bool transB = false;
@@ -67,7 +67,7 @@ aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const ac
aclnnStatus ret = aclnnInnerDispatchFFNCombineGetWorkspaceSize(x, weight1, weight2, expertId, scale1, scale2, probs, group,
maxOutputSize, transB, weightNz,
out, workspaceSize, executor);
out, expertTokenNums, workspaceSize, executor);
return ret;
}

View File

@@ -43,7 +43,7 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWor
const aclTensor* expertId, const aclTensorList* scale1, const aclTensorList* scale2,
const aclTensor* probs,
const char* group, int64_t maxOutputSize,
const aclTensor* out,
const aclTensor* out, const aclTensor* expertTokenNums,
uint64_t* workspaceSize, aclOpExecutor** executor);
/**

View File

@@ -62,6 +62,11 @@ class DispatchFFNCombine : public OpDef {
.DataType({ge::DT_FLOAT16, ge::DT_BF16, ge::DT_BF16})
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND,ge::FORMAT_ND});
this->Output("expert_token_nums")
.ParamType(REQUIRED)
.DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
this->Attr("group").AttrType(REQUIRED).String();
this->Attr("M").AttrType(OPTIONAL).Int();