From 0d658ac3dfbf4459567c39e4875c011094e5342d Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 26 Mar 2025 03:29:57 -0400 Subject: [PATCH] Support recording experts workload in QWen2-MoE (#4775) --- python/sglang/srt/models/qwen2_moe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index 5c3c8d612..7d40bdbf9 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -44,10 +44,13 @@ from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) +from sglang.srt.managers.utils import ExpertDistributionRecorder from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.utils import add_prefix +expert_distribution_recorder = ExpertDistributionRecorder() + class Qwen2MoeMLP(nn.Module): def __init__( @@ -366,6 +369,7 @@ class Qwen2MoeModel(nn.Module): hidden_states = input_embeds residual = None for i in range(len(self.layers)): + expert_distribution_recorder.set_current_layer(i) layer = self.layers[i] hidden_states, residual = layer( positions, hidden_states, forward_batch, residual