From 27778010fc9358fe669cebbf4a00539dfb49ea97 Mon Sep 17 00:00:00 2001
From: Yi Zhang <1109276519@qq.com>
Date: Fri, 12 Sep 2025 11:53:42 +0800
Subject: [PATCH] fix dual stream bug (#10352)

---
 python/sglang/srt/models/qwen2_moe.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py
index ffb619940..9291146d9 100644
--- a/python/sglang/srt/models/qwen2_moe.py
+++ b/python/sglang/srt/models/qwen2_moe.py
@@ -62,6 +62,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
 )
 from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.two_batch_overlap import model_forward_maybe_tbo
@@ -194,7 +195,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
     ) -> torch.Tensor:
         current_stream = torch.cuda.current_stream()
         self.alt_stream.wait_stream(current_stream)
-        shared_output = self._forward_shared_experts(hidden_states)
+        shared_output = self._forward_shared_experts(hidden_states.clone())
         with torch.cuda.stream(self.alt_stream):
             router_output = self._forward_router_experts(hidden_states)
@@ -217,6 +218,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             self.alt_stream is not None
             and hidden_states.shape[0] > 0
             and hidden_states.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD
+            and get_is_capture_mode()
         ):
             final_hidden_states, shared_output = self.forward_normal_dual_stream(
                 hidden_states