From adca585bfb59a6c29cf18393b4a68bd5b4068f08 Mon Sep 17 00:00:00 2001 From: yulei Date: Mon, 14 Apr 2025 07:03:09 +0800 Subject: [PATCH] [DeepEP] Reduce routed scaling overhead (#5277) Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com> --- python/sglang/srt/models/deepseek_v2.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 42ae9d293..d581200cf 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -337,16 +337,13 @@ class DeepseekV2MoE(nn.Module): topk_weights, forward_mode=forward_mode, ) - final_hidden_states = ( - self.experts( - hidden_states=hidden_states, - reorder_topk_ids=reorder_topk_ids, - seg_indptr=seg_indptr, - masked_m=masked_m, - expected_m=expected_m, - forward_mode=forward_mode, - ) - * self.routed_scaling_factor + final_hidden_states = self.experts( + hidden_states=hidden_states, + reorder_topk_ids=reorder_topk_ids, + seg_indptr=seg_indptr, + masked_m=masked_m, + expected_m=expected_m, + forward_mode=forward_mode, ) if self.ep_size > 1: final_hidden_states = self.deepep_dispatcher.combine( @@ -355,6 +352,8 @@ class DeepseekV2MoE(nn.Module): topk_weights, forward_mode, ) + final_hidden_states *= self.routed_scaling_factor + if shared_output is not None: final_hidden_states = final_hidden_states + shared_output