Support dynamically rebalancing experts using EPLB (#6469)
This commit is contained in:
@@ -95,6 +95,8 @@ def update_expert_weights_single_layer(
|
||||
tensor.shape[0] == num_local_physical_experts
|
||||
for tensor in routed_experts_weights
|
||||
), f"{num_local_physical_experts=} {[x.shape for x in routed_experts_weights]=}"
|
||||
assert isinstance(old_physical_to_logical_map, list)
|
||||
assert isinstance(new_physical_to_logical_map, list)
|
||||
|
||||
output_logs = [] if debug else None
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ from sglang.srt.layers.quantization.deep_gemm import (
|
||||
from sglang.srt.layers.sampler import Sampler
|
||||
from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
|
||||
from sglang.srt.lora.lora_manager import LoRAManager
|
||||
from sglang.srt.managers.eplb_manager import EPLBManager
|
||||
from sglang.srt.managers.expert_distribution import (
|
||||
ExpertDistributionRecorder,
|
||||
get_global_expert_distribution_recorder,
|
||||
@@ -255,6 +256,12 @@ class ModelRunner:
|
||||
)
|
||||
)
|
||||
|
||||
self.eplb_manager = (
|
||||
EPLBManager(self)
|
||||
if self.server_args.enable_eplb and (not self.is_draft_worker)
|
||||
else None
|
||||
)
|
||||
|
||||
# Load the model
|
||||
self.sampler = Sampler()
|
||||
self.load_model()
|
||||
@@ -1152,10 +1159,15 @@ class ModelRunner:
|
||||
self.forward_pass_id,
|
||||
forward_batch,
|
||||
):
|
||||
- return self._forward_raw(
|
||||
+ output = self._forward_raw(
|
||||
forward_batch, skip_attn_backend_init, pp_proxy_tensors
|
||||
)
|
||||
|
||||
+ if self.eplb_manager is not None:
|
||||
+ self.eplb_manager.on_forward_pass_end(self.forward_pass_id)
|
||||
|
||||
+ return output
|
||||
|
||||
def _forward_raw(
|
||||
self,
|
||||
forward_batch: ForwardBatch,
|
||||
|
||||
Reference in New Issue
Block a user