Expert distribution recording without overhead for EPLB (#4957)
@@ -18,7 +18,10 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F
 
-from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.managers.expert_distribution import (
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip
 
@@ -31,8 +34,6 @@ if _is_cuda:
 if _is_cuda or _is_hip:
     from sgl_kernel import topk_softmax
 
-expert_distribution_recorder = ExpertDistributionRecorder()
-
 
 def fused_topk_native(
     hidden_states: torch.Tensor,
@@ -353,6 +354,6 @@ def select_experts(
             renormalize=renormalize,
         )
 
-    expert_distribution_recorder.record_new_token(topk_ids)
+    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)
 
     return topk_weights, topk_ids
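The substantive change is that select_experts no longer touches a module-level ExpertDistributionRecorder instance created at import time; instead it always goes through get_global_expert_distribution_recorder() and calls on_select_experts(topk_ids=...). Below is a minimal sketch of that pattern. Only the accessor and method names come from the diff; the no-op/counting split, set_global_expert_distribution_recorder, and the bincount-based accumulation are illustrative assumptions, not sglang's actual implementation.

# Sketch only: a global-accessor recorder whose default implementation is a no-op,
# so the per-token hot path pays essentially nothing when recording is disabled.
# Class names, set_global_expert_distribution_recorder, and the counting logic
# are assumptions made for illustration.
import torch


class _NoopExpertDistributionRecorder:
    """Default recorder: recording is off, so on_select_experts does nothing."""

    def on_select_experts(self, topk_ids: torch.Tensor) -> None:
        pass  # intentionally a no-op


class _CountingExpertDistributionRecorder(_NoopExpertDistributionRecorder):
    """Hypothetical recorder that accumulates how many tokens each expert received."""

    def __init__(self, num_experts: int):
        self.counts = torch.zeros(num_experts, dtype=torch.long)

    def on_select_experts(self, topk_ids: torch.Tensor) -> None:
        # topk_ids has shape (num_tokens, top_k); each entry is a selected expert index.
        self.counts += torch.bincount(
            topk_ids.flatten().to("cpu"), minlength=self.counts.numel()
        )


_global_recorder: _NoopExpertDistributionRecorder = _NoopExpertDistributionRecorder()


def get_global_expert_distribution_recorder() -> _NoopExpertDistributionRecorder:
    return _global_recorder


def set_global_expert_distribution_recorder(
    recorder: _NoopExpertDistributionRecorder,
) -> None:
    global _global_recorder
    _global_recorder = recorder

With this shape, the call in select_experts stays unconditional but cheap: unless a counting recorder has been installed, it dispatches to the no-op method, which is one way the "without overhead" goal in the title can be met without scattering is-recording checks through the MoE top-k path.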