Expert distribution recording without overhead for EPLB (#4957)

2025-05-20 11:07:43 +08:00
parent b146555749
commit f0653886a5
12 changed files with 1123 additions and 194 deletions
--- a/python/sglang/srt/layers/moe/topk.py
+++ b/python/sglang/srt/layers/moe/topk.py
@@ -18,7 +18,10 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F

-from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.managers.expert_distribution import (
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip

@@ -31,8 +34,6 @@ if _is_cuda:
 if _is_cuda or _is_hip:
    from sgl_kernel import topk_softmax

-expert_distribution_recorder = ExpertDistributionRecorder()
-

 def fused_topk_native(
    hidden_states: torch.Tensor,
@@ -353,6 +354,6 @@ def select_experts(
            renormalize=renormalize,
        )

-    expert_distribution_recorder.record_new_token(topk_ids)
+    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)

    return topk_weights, topk_ids