[Feature]EPLB:Adapt DispatchGmmCombineDecode operator to eplb tensor list and expert token numbers (#5552)
#### What this PR does / why we need it?
This PR adapts the DispatchGmmCombineDecode operator to the EPLB tensor list
and expert token numbers.
This operator supports gmm1, gmm2, gmm1Scale and gmm2Scale passed in list
format.
This operator supports counting how many tokens each local expert receives,
reported via expertTokensNum.
- vLLM version: v0.13.0
- vLLM main:
7157596103
For more information about this operator, please refer to the RFC issue:
https://github.com/vllm-project/vllm-ascend/issues/5476
This commit is contained in:
@@ -54,12 +54,15 @@ class VllmEplbAdaptor(EplbAdaptor):
|
||||
self.model.model.layers[i].mlp.experts.w13_weight_scale_fp32_list
|
||||
self.param_dict["model.layers." + str(i) + ".mlp.experts." + "w2_weight_scale_list"] = \
|
||||
self.model.model.layers[i].mlp.experts.w2_weight_scale_list
|
||||
self.param_dict["model.layers." + str(i) + ".mlp.experts." + "w2_weight_scale_fp32_list"] = \
|
||||
self.model.model.layers[i].mlp.experts.w2_weight_scale_fp32_list
|
||||
# TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 and qwen3-moe is supported here
|
||||
if self.model.quant_config is not None:
|
||||
self.expert_weight_names = [
|
||||
"w13_weight_list", "w2_weight_list",
|
||||
"w13_weight_scale_fp32_list", "w13_weight_offset",
|
||||
"w2_weight_scale_list", "w2_weight_offset"
|
||||
"w2_weight_scale_list", "w2_weight_offset",
|
||||
"w2_weight_scale_fp32_list"
|
||||
]
|
||||
else:
|
||||
self.expert_weight_names = ["w13_weight", "w2_weight"]
|
||||
@@ -97,7 +100,8 @@ class VllmEplbAdaptor(EplbAdaptor):
|
||||
self.num_dense_layers) + ".mlp.experts." + name
|
||||
if name in [
|
||||
"w13_weight_list", "w2_weight_list",
|
||||
"w13_weight_scale_fp32_list", "w2_weight_scale_list"
|
||||
"w13_weight_scale_fp32_list", "w2_weight_scale_list",
|
||||
"w2_weight_scale_fp32_list"
|
||||
]:
|
||||
expert_tensor = self.param_dict[complete_name][0]
|
||||
expert_tensor = expert_tensor.clone()
|
||||
@@ -118,7 +122,7 @@ class VllmEplbAdaptor(EplbAdaptor):
|
||||
if name in [
|
||||
"w13_weight_list", "w2_weight_list",
|
||||
"w13_weight_scale_fp32_list",
|
||||
"w2_weight_scale_list"
|
||||
"w2_weight_scale_list", "w2_weight_scale_fp32_list"
|
||||
]:
|
||||
per_expert_param.append(
|
||||
self.param_dict["model.layers." + str(layer_idx) +
|
||||
|
||||
Reference in New Issue
Block a user