### What this PR does / why we need it?
| File Path |
| :--- |
| ` vllm_ascend/eplb/adaptor/abstract_adaptor.py` |
| ` vllm_ascend/eplb/adaptor/vllm_adaptor.py` |
| ` vllm_ascend/eplb/core/eplb_device_transfer_loader.py` |
| ` vllm_ascend/eplb/core/eplb_utils.py` |
| ` vllm_ascend/eplb/core/eplb_worker.py` |
| ` vllm_ascend/eplb/core/policy/policy_abstract.py` |
| ` vllm_ascend/eplb/core/policy/policy_default_eplb.py` |
| ` vllm_ascend/eplb/core/policy/policy_factory.py` |
| ` vllm_ascend/eplb/core/policy/policy_flashlb.py` |
| ` vllm_ascend/eplb/core/policy/policy_random.py` |
| ` vllm_ascend/eplb/core/policy/policy_swift_balancer.py` |
| ` vllm_ascend/eplb/eplb_updator.py` |
| ` vllm_ascend/eplb/utils.py` |
| ` vllm_ascend/model_loader/netloader/executor/elastic_load.py` |
| ` vllm_ascend/model_loader/netloader/executor/netloader_pg.py` |
| ` vllm_ascend/model_loader/netloader/interaction/elastic.py` |
| ` vllm_ascend/model_loader/netloader/load.py` |
| ` vllm_ascend/model_loader/netloader/netloader.py` |
| ` vllm_ascend/model_loader/netloader/utils.py` |
| ` vllm_ascend/patch/platform/__init__.py` |
| ` vllm_ascend/patch/platform/patch_balance_schedule.py` |
| ` vllm_ascend/patch/platform/patch_ec_connector.py` |
| ` vllm_ascend/patch/platform/patch_mamba_config.py` |
| ` vllm_ascend/patch/platform/patch_multiproc_executor.py` |
| ` vllm_ascend/patch/platform/patch_sched_yield.py` |
- vLLM version: v0.13.0
- vLLM main:
2c24bc6996
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
This commit is contained in:
@@ -26,9 +26,7 @@ from vllm_ascend.eplb.core.eplb_worker import EplbProcess
|
||||
|
||||
|
||||
class EplbUpdator:
|
||||
|
||||
def __init__(self, eplb_config, loader, eplb_process: EplbProcess,
|
||||
process):
|
||||
def __init__(self, eplb_config, loader, eplb_process: EplbProcess, process):
|
||||
self.eplb_config = eplb_config
|
||||
self.init_eplb(self.eplb_config.expert_map_path, process)
|
||||
self.eplb_loader = loader
|
||||
@@ -43,9 +41,7 @@ class EplbUpdator:
|
||||
self.world_size = dist.get_world_size()
|
||||
self.device = local_load.device
|
||||
shape = (self.world_size, *local_load.shape)
|
||||
self._gather_buffer = torch.empty(shape,
|
||||
dtype=local_load.dtype,
|
||||
device=self.device)
|
||||
self._gather_buffer = torch.empty(shape, dtype=local_load.dtype, device=self.device)
|
||||
|
||||
def init_eplb(self, expert_map_path, process):
|
||||
self.rank_id = dist.get_rank()
|
||||
@@ -72,52 +68,49 @@ class EplbUpdator:
|
||||
|
||||
self.process = process
|
||||
|
||||
logger.info(
|
||||
f"[ModelRunner] Launched EPLB process (pid={self.process.pid})")
|
||||
logger.info(f"[ModelRunner] Launched EPLB process (pid={self.process.pid})")
|
||||
|
||||
def update_iteration(self):
|
||||
self.cur_iterations += 1
|
||||
if self.cur_iterations == (self.expert_heat_collection_interval + \
|
||||
self.algorithm_execution_interval + self.num_moe_layers):
|
||||
if self.cur_iterations == (
|
||||
self.expert_heat_collection_interval + self.algorithm_execution_interval + self.num_moe_layers
|
||||
):
|
||||
logger.info("Finish expert parallel load balancing.")
|
||||
if self.expert_map_record_path is not None:
|
||||
self.adaptor._export_tensor_to_file(
|
||||
self.shared_dict["expert_maps"],
|
||||
self.expert_map_record_path)
|
||||
self.adaptor._export_tensor_to_file(self.shared_dict["expert_maps"], self.expert_map_record_path)
|
||||
|
||||
self.adaptor.model.clear_all_moe_loads()
|
||||
self.cur_iterations = 0
|
||||
|
||||
def get_update_info_flag(self):
|
||||
return self.cur_iterations == (self.expert_heat_collection_interval +
|
||||
self.algorithm_execution_interval - 1)
|
||||
return self.cur_iterations == (self.expert_heat_collection_interval + self.algorithm_execution_interval - 1)
|
||||
|
||||
def wakeup_eplb_worker_flag(self):
|
||||
return self.cur_iterations == (self.expert_heat_collection_interval -
|
||||
1)
|
||||
return self.cur_iterations == (self.expert_heat_collection_interval - 1)
|
||||
|
||||
def update_expert_weight_flag(self):
|
||||
weight_update_counter = self.cur_iterations - (
|
||||
self.expert_heat_collection_interval +
|
||||
self.algorithm_execution_interval)
|
||||
return (weight_update_counter >= 0
|
||||
and weight_update_counter < self.num_moe_layers)
|
||||
self.expert_heat_collection_interval + self.algorithm_execution_interval
|
||||
)
|
||||
return weight_update_counter >= 0 and weight_update_counter < self.num_moe_layers
|
||||
|
||||
def wakeup_eplb_worker(self):
|
||||
self.eplb_process.planner_q.put(1)
|
||||
|
||||
def forward_before(self):
|
||||
if self.update_expert_weight_flag():
|
||||
(expert_send_info, expert_recv_info, updated_expert_map,
|
||||
log2phy_map, layer_id) = self.update_info_all.pop(0)
|
||||
(expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(
|
||||
0
|
||||
)
|
||||
log2phy_map_this_rank = torch.from_numpy(numpy.array(log2phy_map))
|
||||
self.eplb_loader.set_log2phy_map(log2phy_map_this_rank)
|
||||
updated_expert_map_this_rank = torch.from_numpy(
|
||||
numpy.array(updated_expert_map))
|
||||
updated_expert_map_this_rank = torch.from_numpy(numpy.array(updated_expert_map))
|
||||
self.eplb_loader.generate_expert_d2d_transfer_task(
|
||||
expert_send_info, expert_recv_info,
|
||||
expert_send_info,
|
||||
expert_recv_info,
|
||||
updated_expert_map_this_rank,
|
||||
layer_id + self.adaptor.num_dense_layers)
|
||||
layer_id + self.adaptor.num_dense_layers,
|
||||
)
|
||||
|
||||
# set asynchronous stream for d2d expert weight update
|
||||
self.reqs = []
|
||||
@@ -133,8 +126,7 @@ class EplbUpdator:
|
||||
self.compute_and_set_moe_load()
|
||||
self.wakeup_eplb_worker()
|
||||
|
||||
if self.update_expert_weight_flag(
|
||||
) and self.expert_map_record_path is None:
|
||||
if self.update_expert_weight_flag() and self.expert_map_record_path is None:
|
||||
self.eplb_loader.update_expert_map_and_weight(self.reqs)
|
||||
|
||||
self.update_iteration()
|
||||
@@ -145,9 +137,7 @@ class EplbUpdator:
|
||||
|
||||
moe_load = self._gather_buffer.permute(1, 0, 2)
|
||||
self.shared_dict["moe_load"] = moe_load.cpu()
|
||||
logger.debug(
|
||||
f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}"
|
||||
)
|
||||
logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}")
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
self.compute_moe_imbalance(moe_load)
|
||||
@@ -156,7 +146,6 @@ class EplbUpdator:
|
||||
return moe_load
|
||||
|
||||
def compute_moe_imbalance(self, moe_load: torch.Tensor):
|
||||
|
||||
self.moe_imbalance_dict.clear()
|
||||
|
||||
layer_card_load = moe_load.sum(dim=-1).cpu().float()
|
||||
@@ -169,13 +158,11 @@ class EplbUpdator:
|
||||
|
||||
moe_load_imbalance = max_load / (mean_load + 1e-6)
|
||||
|
||||
logger.debug(f"[ModelRunner][MOE_load_stats][Layer {layer_idx}] "
|
||||
f"PAR={moe_load_imbalance:.4f}")
|
||||
logger.debug(f"[ModelRunner][MOE_load_stats][Layer {layer_idx}] PAR={moe_load_imbalance:.4f}")
|
||||
|
||||
self.moe_imbalance_dict[layer_idx] = moe_load_imbalance
|
||||
|
||||
def summarize_moe_imbalance(self):
|
||||
|
||||
values = list(self.moe_imbalance_dict.values())
|
||||
if not values:
|
||||
logger.info("[MOE_load_stats] No data available.")
|
||||
@@ -191,11 +178,10 @@ class EplbUpdator:
|
||||
)
|
||||
|
||||
def warm_up_eplb(self):
|
||||
|
||||
self.shared_dict["expert_maps"] = self.adaptor.get_global_expert_map()
|
||||
self.compute_and_set_moe_load()
|
||||
|
||||
src_tensor = torch.empty((1, ), device=self.device)
|
||||
src_tensor = torch.empty((1,), device=self.device)
|
||||
self_rank = dist.get_rank()
|
||||
|
||||
comm_op_list = []
|
||||
|
||||
Reference in New Issue
Block a user