From 541a985f85bc59b9a38cf4bf0132ce795694ff3c Mon Sep 17 00:00:00 2001
From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Date: Thu, 29 May 2025 06:53:37 +0800
Subject: [PATCH] Fuse routed_scaling_factor in DeepSeek (#6710)

---
 python/sglang/srt/models/deepseek_v2.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 5252c2411..29f18f0ef 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -526,9 +526,13 @@ class DeepseekV2MoE(nn.Module):
 
     def op_output(self, state):
         final_hidden_states = state.pop("hidden_states_after_combine")
-        final_hidden_states *= self.routed_scaling_factor
-        if (s := state.pop("shared_output")) is not None:
-            final_hidden_states = final_hidden_states + s
+
+        if (shared_output := state.pop("shared_output")) is not None:
+            x = shared_output
+            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        else:
+            final_hidden_states *= self.routed_scaling_factor
 
         state.hidden_states_mlp_output = final_hidden_states
 