From 0d658ac3dfbf4459567c39e4875c011094e5342d Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 26 Mar 2025 03:29:57 -0400
Subject: [PATCH] Support recording expert workload in Qwen2-MoE (#4775)

---
 python/sglang/srt/models/qwen2_moe.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py
index 5c3c8d612..7d40bdbf9 100644
--- a/python/sglang/srt/models/qwen2_moe.py
+++ b/python/sglang/srt/models/qwen2_moe.py
@@ -44,10 +44,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
+from sglang.srt.managers.utils import ExpertDistributionRecorder
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.utils import add_prefix
 
+expert_distribution_recorder = ExpertDistributionRecorder()
+
 
 class Qwen2MoeMLP(nn.Module):
     def __init__(
@@ -366,6 +369,7 @@ class Qwen2MoeModel(nn.Module):
             hidden_states = input_embeds
         residual = None
         for i in range(len(self.layers)):
+            expert_distribution_recorder.set_current_layer(i)
             layer = self.layers[i]
             hidden_states, residual = layer(
                 positions, hidden_states, forward_batch, residual