Fuse routed scaling factor in deepseek (#6970)

This commit is contained in:
Xiaoyu Zhang
2025-06-09 06:24:24 +08:00
committed by GitHub
parent 971a0dfa32
commit 3712abfaf9
10 changed files with 338 additions and 15 deletions

View File

@@ -346,7 +346,8 @@ class DeepseekV2MoE(nn.Module):
final_hidden_states = self.experts(
hidden_states=hidden_states, router_logits=router_logits
)
- final_hidden_states *= self.routed_scaling_factor
+ if not _is_cuda:
+     final_hidden_states *= self.routed_scaling_factor
if shared_output is not None:
final_hidden_states = final_hidden_states + shared_output
if self.tp_size > 1: