[Refactor] Add expert processed token count output for DispatchFFNCombine/DispatchFFNCombineBF16 (#6402)
### What this PR does / why we need it?
Add New Output for Expert Token Count
An additional output tensor, expert_token_nums, is added to both operators
so that the distribution of tokens among experts can be tracked:
Tensor Name: expert_token_nums
Dimension: 1D tensor
Shape: (local_expert_num,)
Data Type: int32
Semantics: Represents the number of tokens actually received by each
expert on the current card.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.14.1
- vLLM main:
dc917cceb8
---------
Signed-off-by: guanguan0308 <1546542263@qq.com>
Signed-off-by: guanguan0308 <162653673+guanguan0308@users.noreply.github.com>
This commit is contained in:
@@ -47,7 +47,7 @@ extern aclnnStatus aclnnInnerDispatchFFNCombineGetWorkspaceSize(const aclTensor*
|
||||
const aclTensor* probs,
|
||||
const char* group, int64_t maxOutputSize,
|
||||
bool transB, bool weightNz,
|
||||
const aclTensor* out,
|
||||
const aclTensor* out, const aclTensor* expertTokenNums,
|
||||
uint64_t* workspaceSize, aclOpExecutor** executor);
|
||||
extern aclnnStatus aclnnInnerDispatchFFNCombine(void *workspace, uint64_t workspaceSize,
|
||||
aclOpExecutor *executor, aclrtStream stream);
|
||||
@@ -59,7 +59,7 @@ aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const ac
|
||||
const aclTensor* expertId, const aclTensorList* scale1, const aclTensorList* scale2,
|
||||
const aclTensor* probs,
|
||||
const char* group, int64_t maxOutputSize,
|
||||
const aclTensor* out,
|
||||
const aclTensor* out, const aclTensor* expertTokenNums,
|
||||
uint64_t* workspaceSize, aclOpExecutor** executor)
|
||||
{
|
||||
bool transB = false;
|
||||
@@ -67,7 +67,7 @@ aclnnStatus aclnnDispatchFFNCombineGetWorkspaceSize(const aclTensor* x, const ac
|
||||
|
||||
aclnnStatus ret = aclnnInnerDispatchFFNCombineGetWorkspaceSize(x, weight1, weight2, expertId, scale1, scale2, probs, group,
|
||||
maxOutputSize, transB, weightNz,
|
||||
out, workspaceSize, executor);
|
||||
out, expertTokenNums, workspaceSize, executor);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ __attribute__((visibility("default"))) aclnnStatus aclnnDispatchFFNCombineGetWor
|
||||
const aclTensor* expertId, const aclTensorList* scale1, const aclTensorList* scale2,
|
||||
const aclTensor* probs,
|
||||
const char* group, int64_t maxOutputSize,
|
||||
const aclTensor* out,
|
||||
const aclTensor* out, const aclTensor* expertTokenNums,
|
||||
uint64_t* workspaceSize, aclOpExecutor** executor);
|
||||
|
||||
/**
|
||||
|
||||
@@ -62,6 +62,11 @@ class DispatchFFNCombine : public OpDef {
|
||||
.DataType({ge::DT_FLOAT16, ge::DT_BF16, ge::DT_BF16})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND,ge::FORMAT_ND});
|
||||
this->Output("expert_token_nums")
|
||||
.ParamType(REQUIRED)
|
||||
.DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
|
||||
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
|
||||
.UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
|
||||
|
||||
this->Attr("group").AttrType(REQUIRED).String();
|
||||
this->Attr("M").AttrType(OPTIONAL).Int();
|
||||
|
||||
Reference in New Issue
Block a user