[Refactor] Add expert processed token count output for DispatchFFNCombine/DispatchFFNCombineBF16 (#6402)
### What this PR does / why we need it?
Add New Output for Expert Token Count
An additional output tensor, `expert_token_nums`, is added to both operators
to support tracking the token distribution among experts:
Tensor Name: expert_token_nums
Dimension: 1D tensor
Shape: (local_expert_num,)
Data Type: int32
Semantics: Represents the number of tokens actually received by each
expert on the current card.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
---------
Signed-off-by: guanguan0308 <1546542263@qq.com>
Signed-off-by: guanguan0308 <162653673+guanguan0308@users.noreply.github.com>
This commit is contained in:
@@ -126,6 +126,7 @@ class TestDisptachFFNCombine:
         scale2_npu.append(scale2[i].npu())

         out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
+        expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()

         torch.ops._C_ascend.dispatch_ffn_combine(
             x=x,
@@ -138,6 +139,7 @@ class TestDisptachFFNCombine:
             group=self.hcomm_info,
             max_output_size=512,
             out=out,
+            expert_token_nums=expert_token_nums,
         )
         return True

@@ -177,6 +179,7 @@ class TestDisptachFFNCombine:
         scale2_npu.append(scale2.npu())

         out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
+        expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()

         torch.ops._C_ascend.dispatch_ffn_combine(
             x=x,
@@ -189,6 +192,7 @@ class TestDisptachFFNCombine:
             group=self.hcomm_info,
             max_output_size=512,
             out=out,
+            expert_token_nums=expert_token_nums,
         )
         return True

@@ -126,6 +126,7 @@ class TestDisptachFFNCombine:
         scale2_npu.append(scale2[i].npu())

         out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
+        expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()

         torch.ops._C_ascend.dispatch_ffn_combine(
             x=x,
@@ -138,6 +139,7 @@ class TestDisptachFFNCombine:
             group=self.hcomm_info,
             max_output_size=512,
             out=out,
+            expert_token_nums=expert_token_nums,
         )
         return True

@@ -177,6 +179,7 @@ class TestDisptachFFNCombine:
|
||||
scale2_npu.append(scale2.npu())
|
||||
|
||||
out = self.generate_random_tensor((m, k), dtype=torch.bfloat16).npu()
|
||||
expert_token_nums = self.generate_random_tensor((1, e), dtype=torch.int32).npu()
|
||||
|
||||
torch.ops._C_ascend.dispatch_ffn_combine(
|
||||
x=x,
|
||||
@@ -189,6 +192,7 @@ class TestDisptachFFNCombine:
|
||||
group=self.hcomm_info,
|
||||
max_output_size=512,
|
||||
out=out,
|
||||
expert_token_nums=expert_token_nums,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
Reference in New Issue
Block a user