[Refactor] Add expert processed token count output for DispatchFFNCombine/DispatchFFNCombineBF16 (#6402)
### What this PR does / why we need it?
Add New Output for Expert Token Count
An additional output tensor, `expert_token_nums`, is added to both operators
to support tracking the token distribution among experts:
Tensor Name: expert_token_nums
Dimension: 1D tensor
Shape: (local_expert_num,)
Data Type: int32
Semantics: Represents the number of tokens actually received by each
expert on the current card.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
---------
Signed-off-by: guanguan0308 <1546542263@qq.com>
Signed-off-by: guanguan0308 <162653673+guanguan0308@users.noreply.github.com>
This commit is contained in:
@@ -126,6 +126,7 @@ class TestDisptachFFNCombine:
         scale2_npu.append(scale2[i].npu())

         out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
+        expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()

         torch.ops._C_ascend.dispatch_ffn_combine(
             x=x,
@@ -138,6 +139,7 @@ class TestDisptachFFNCombine:
             group=self.hcomm_info,
             max_output_size=512,
             out=out,
+            expert_token_nums=expert_token_nums,
         )
         return True

@@ -177,6 +179,7 @@ class TestDisptachFFNCombine:
         scale2_npu.append(scale2.npu())

         out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
+        expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()

         torch.ops._C_ascend.dispatch_ffn_combine(
             x=x,
@@ -189,6 +192,7 @@ class TestDisptachFFNCombine:
             group=self.hcomm_info,
             max_output_size=512,
             out=out,
+            expert_token_nums=expert_token_nums,
         )
         return True

@@ -126,6 +126,7 @@ class TestDisptachFFNCombine:
         scale2_npu.append(scale2[i].npu())

         out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
+        expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()

         torch.ops._C_ascend.dispatch_ffn_combine(
             x=x,
@@ -138,6 +139,7 @@ class TestDisptachFFNCombine:
             group=self.hcomm_info,
             max_output_size=512,
             out=out,
+            expert_token_nums=expert_token_nums,
         )
         return True

@@ -177,6 +179,7 @@ class TestDisptachFFNCombine:
|
||||
scale2_npu.append(scale2.npu())
|
||||
|
||||
out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
|
||||
expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()
|
||||
|
||||
torch.ops._C_ascend.dispatch_ffn_combine(
|
||||
x=x,
|
||||
@@ -189,6 +192,7 @@ class TestDisptachFFNCombine:
|
||||
group=self.hcomm_info,
|
||||
max_output_size=512,
|
||||
out=out,
|
||||
expert_token_nums=expert_token_nums,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
Reference in New Issue
Block a user