[1/N][Draft][Refactor]torchair pangu_moe modeling refactor (#2437)

### What this PR does / why we need it?

1. Similar to #2384, this PR adds torchair-specific modeling for
pangu.
2. Fixes a bug introduced by routed_scaling_factor in #2675.
3. Removes the eager test case for pangu, since a torchair test case
already exists.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

---------

Signed-off-by: zengyanjia <z00883269@china.huawei.com>
Signed-off-by: Angazenn <supperccell@163.com>
Co-authored-by: zengyanjia <z00883269@china.huawei.com>
This commit is contained in:
Angazenn
2025-09-04 10:39:21 +08:00
committed by GitHub
parent a58013440a
commit e7409e95ee
6 changed files with 1185 additions and 55 deletions

View File

@@ -170,15 +170,6 @@ def fused_experts_moge(
local_num_experts = global_num_experts // ep_size
local_num_group = top_k // ep_size
if apply_router_weight_on_input:
assert (topk_weights.dim() == 2
), "`topk_weights` should be in shape (num_tokens, topk)"
_, topk = topk_weights.shape
assert (
topk == 1
), "Only support topk=1 when `apply_router_weight_on_input` is True"
hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
bsz, _ = hidden_states.shape
flatten_topk_ids = topk_ids.view(-1)
sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
@@ -407,6 +398,7 @@ class AscendFusedMoE(FusedMoE):
prefix="",
custom_routing_function=None,
scoring_func="softmax",
routed_scaling_fator: float = 1.0,
e_score_correction_bias=None,
apply_router_weight_on_input=False,
activation="silu",
@@ -414,31 +406,59 @@ class AscendFusedMoE(FusedMoE):
num_redundant_experts=0,
has_bias=False,
):
super().__init__(
num_experts,
top_k,
hidden_size,
intermediate_size,
params_dtype,
reduce_results,
renormalize,
use_grouped_topk,
num_expert_group,
topk_group,
quant_config,
tp_size,
ep_size,
dp_size,
prefix,
custom_routing_function,
scoring_func,
e_score_correction_bias,
apply_router_weight_on_input,
activation,
enable_eplb,
num_redundant_experts,
has_bias,
)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
super().__init__(
num_experts,
top_k,
hidden_size,
intermediate_size,
params_dtype,
reduce_results,
renormalize,
use_grouped_topk,
num_expert_group,
topk_group,
quant_config,
tp_size,
ep_size,
dp_size,
prefix,
custom_routing_function,
scoring_func,
e_score_correction_bias,
apply_router_weight_on_input,
activation,
enable_eplb,
num_redundant_experts,
has_bias,
)
else:
super().__init__(
num_experts,
top_k,
hidden_size,
intermediate_size,
params_dtype,
reduce_results,
renormalize,
use_grouped_topk,
num_expert_group,
topk_group,
quant_config,
tp_size,
ep_size,
dp_size,
prefix,
custom_routing_function,
scoring_func,
routed_scaling_fator,
e_score_correction_bias,
apply_router_weight_on_input,
activation,
enable_eplb,
num_redundant_experts,
has_bias,
)
setup_token_dispatchers(self.moe_config.ep_size,
top_k=self.top_k,