Fuse routed_scaling_factor in DeepSeek (#6710)

2025-05-29 06:53:37 +08:00
parent 5170b010a6
commit 541a985f85
1 changed file with 7 additions and 3 deletions
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -526,9 +526,13 @@ class DeepseekV2MoE(nn.Module):

    def op_output(self, state):
        final_hidden_states = state.pop("hidden_states_after_combine")
-        final_hidden_states *= self.routed_scaling_factor
-        if (s := state.pop("shared_output")) is not None:
-            final_hidden_states = final_hidden_states + s
+
+        if (shared_output := state.pop("shared_output")) is not None:
+            x = shared_output
+            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        else:
+            final_hidden_states *= self.routed_scaling_factor

        state.hidden_states_mlp_output = final_hidden_states