Super tiny: enable standalone usage of expert distribution metrics and update docs (#6680)
This commit is contained in:
@@ -27,7 +27,8 @@ class EPLBManager:
|
|||||||
<= self._server_args.expert_distribution_recorder_buffer_size
|
<= self._server_args.expert_distribution_recorder_buffer_size
|
||||||
), "eplb_rebalance_num_iterations must be less than expert_distribution_recorder_buffer_size"
|
), "eplb_rebalance_num_iterations must be less than expert_distribution_recorder_buffer_size"
|
||||||
|
|
||||||
get_global_expert_distribution_recorder().start_record()
|
if not get_global_expert_distribution_recorder().recording:
|
||||||
|
get_global_expert_distribution_recorder().start_record()
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[EPLBManager] system started, will rebalance per {self._server_args.eplb_rebalance_num_iterations} iterations."
|
f"[EPLBManager] system started, will rebalance per {self._server_args.eplb_rebalance_num_iterations} iterations."
|
||||||
|
|||||||
@@ -91,6 +91,10 @@ class ExpertDistributionRecorder(ABC):
|
|||||||
def dump_record(self, output_mode: _OutputMode = "file"):
|
def dump_record(self, output_mode: _OutputMode = "file"):
|
||||||
self._on_not_implemented()
|
self._on_not_implemented()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def recording(self):
|
||||||
|
return False
|
||||||
|
|
||||||
def _on_not_implemented(self):
|
def _on_not_implemented(self):
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
|
"Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
|
||||||
@@ -123,6 +127,12 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
|
|||||||
for k in self._accumulator.get_single_pass_gatherer_keys()
|
for k in self._accumulator.get_single_pass_gatherer_keys()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if server_args.enable_expert_distribution_metrics:
|
||||||
|
logger.info(
|
||||||
|
"ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
|
||||||
|
)
|
||||||
|
self.start_record()
|
||||||
|
|
||||||
def with_current_layer(self, layer_idx):
|
def with_current_layer(self, layer_idx):
|
||||||
return self._current_layer_idx.with_value(layer_idx)
|
return self._current_layer_idx.with_value(layer_idx)
|
||||||
|
|
||||||
@@ -221,6 +231,10 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
|
|||||||
self._reset()
|
self._reset()
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
@property
|
||||||
|
def recording(self):
|
||||||
|
return self._recording
|
||||||
|
|
||||||
|
|
||||||
_global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
|
_global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
|
||||||
_ExpertDistributionRecorderNoop()
|
_ExpertDistributionRecorderNoop()
|
||||||
|
|||||||
@@ -1355,7 +1355,7 @@ class ServerArgs:
|
|||||||
"--deepep-config",
|
"--deepep-config",
|
||||||
type=str,
|
type=str,
|
||||||
default=ServerArgs.deepep_config,
|
default=ServerArgs.deepep_config,
|
||||||
help="Tuned DeepEP config suitable for your own cluster.",
|
help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
Reference in New Issue
Block a user