Fuse routed_scaling_factor in DeepSeek (#6710)

2025-05-29 06:53:37 +08:00
parent 5170b010a6
commit 541a985f85
1 changed files with 7 additions and 3 deletions
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -526,9 +526,13 @@ class DeepseekV2MoE(nn.Module):
    def op_output(self, state):
        """Combine the routed-expert output with the optional shared output.

        Pops ``"hidden_states_after_combine"`` and ``"shared_output"`` from
        ``state`` (each exactly once) and stores the result in
        ``state.hidden_states_mlp_output``:

        * shared output present: ``shared + routed_scaling_factor * routed``
        * shared output is ``None``: ``routed_scaling_factor * routed``

        Both paths work in place: the result reuses the storage of the shared
        output when there is one, otherwise that of the routed output.

        Args:
            state: Mutable per-step state that supports ``pop(key)`` and
                attribute assignment. Values are presumably torch tensors
                (the code relies on ``add_(..., alpha=...)`` and ``*=``).
        """
        final_hidden_states = state.pop("hidden_states_after_combine")
        # Pop "shared_output" only once; a second pop would fail on a state
        # that raises for missing keys, or re-add the shared output otherwise.
        if (shared_output := state.pop("shared_output")) is not None:
            # Fused scale-and-add: shared_output += alpha * final_hidden_states,
            # instead of a separate in-place multiply followed by an add.
            shared_output.add_(
                final_hidden_states, alpha=self.routed_scaling_factor
            )
            final_hidden_states = shared_output
        else:
            final_hidden_states *= self.routed_scaling_factor
        state.hidden_states_mlp_output = final_hidden_states