From bdbb8d009ae10dae19b4bfee37047ccd2f3e4b2a Mon Sep 17 00:00:00 2001 From: JieXin Liang Date: Tue, 24 Jun 2025 03:05:30 +0800 Subject: [PATCH] [perf] slightly improve DeepSeek-R1-FP4 TP8 (#7481) --- python/sglang/srt/models/deepseek_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index ccb5fd254..83b5c833e 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -362,12 +362,14 @@ class DeepseekV2MoE(nn.Module): return self.forward_deepep(hidden_states, forward_batch) def forward_normal_dual_stream(self, hidden_states: torch.Tensor) -> torch.Tensor: + # router_logits: (num_tokens, n_experts) + router_logits = self.gate(hidden_states) + current_stream = torch.cuda.current_stream() self.alt_stream.wait_stream(current_stream) shared_output = self._forward_shared_experts(hidden_states) + with torch.cuda.stream(self.alt_stream): - # router_logits: (num_tokens, n_experts) - router_logits = self.gate(hidden_states) final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits )