[2/2] Fuse routed scaling factor into select_experts (#8690)

This commit is contained in:
Trevor Morris
2025-08-20 15:10:16 -07:00
committed by GitHub
parent f96413c444
commit a91e90d9a3
6 changed files with 55 additions and 25 deletions

View File

@@ -1305,8 +1305,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
tp_rank=layer.moe_tp_rank,
tune_max_num_tokens=next_power_of_2(x.shape[0]),
)[0]
-if moe_runner_config.routed_scaling_factor is not None:
-output *= moe_runner_config.routed_scaling_factor
+# Scale by routed_scaling_factor is fused into select_experts.
if should_use_flashinfer_cutlass_moe_fp4_allgather():
output, global_output = get_local_dp_buffer(), output
get_tp_group().reduce_scatterv(
@@ -1332,6 +1331,5 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
params=layer.cutlass_moe_params,
apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
).to(x.dtype)
-if moe_runner_config.routed_scaling_factor is not None:
-output *= moe_runner_config.routed_scaling_factor
+# Scale by routed_scaling_factor is fused into select_experts.
return output