[EPLB][Bugfix] Dispatch Allgather use log2phy if enable eplb (#5933)
### What this PR does / why we need it?
1. Move the expert-mapping logic forward so that later changes do not require shotgun edits across call sites.
2. Disable updates to the expert map.
### How was this patch tested?
A2:
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| GPQA_diamond | 53064e | accuracy | gen | 73.23 |
A3:
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 83.33 |
- vLLM version: v0.13.0
- vLLM main: 11b6af5280
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
```diff
@@ -33,12 +33,7 @@ class VllmEplbAdaptor(EplbAdaptor):
         self.rank_id = dist.get_rank()
         self.world_size = dist.get_world_size()
         self.param_dict = dict(self.model.named_parameters())
-        if self.model.config.model_type == "qwen3_moe":
-            self.num_dense_layers = 0
-            self.global_expert_num = self.model.config.num_experts
-        else:
-            self.num_dense_layers = self.model.config.first_k_dense_replace
-            self.global_expert_num = self.model.config.n_routed_experts
+        self.num_dense_layers = getattr(self.model.config, "first_k_dense_replace", 0)
         self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers

         for i in range(self.num_dense_layers,
```
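This hunk collapses the `qwen3_moe` special case into a single `getattr` fallback: configs that define `first_k_dense_replace` (DeepSeek-style) keep their dense prefix, while configs without it (such as Qwen3-MoE) default to 0 dense layers. A minimal sketch of that equivalence, using hypothetical stand-in configs:

```python
# Hypothetical stand-in configs; only the attributes used here are modeled.
from types import SimpleNamespace

deepseek_like = SimpleNamespace(first_k_dense_replace=3, num_hidden_layers=61)
qwen3_moe_like = SimpleNamespace(num_hidden_layers=48)  # no first_k_dense_replace

for config in (deepseek_like, qwen3_moe_like):
    # A missing attribute falls back to 0 dense layers, matching the removed
    # `if model_type == "qwen3_moe"` branch.
    num_dense_layers = getattr(config, "first_k_dense_replace", 0)
    num_moe_layers = config.num_hidden_layers - num_dense_layers
    print(num_dense_layers, num_moe_layers)  # 3 58, then 0 48
```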
```diff
@@ -64,17 +59,10 @@ class VllmEplbAdaptor(EplbAdaptor):
         else:
             self.expert_weight_names = ["w13_weight", "w2_weight"]

         self.expert_map_per_layer = dict(
         )  # reference to expert map on device for expert map update
-        self.expert_map_per_layer_cpu = dict(
-        )  # copy of expert map on CPU to avoid device synchronize frequently
-        for layer_idx in range(self.num_moe_layers):
-            self.expert_map_per_layer[self.num_dense_layers + layer_idx] = \
-                self.model.get_expert_map(self.num_dense_layers + layer_idx)

-        # TODO: here we set number of buffer tensor equal to number of expert in each laryer, which can be improved
-        num_buffer_tensor = torch.where(
-            self.expert_map_per_layer[self.num_dense_layers] != -1)[0].numel()
+        num_buffer_tensor = self.model.model.layers[-1].mlp.experts.local_num_experts
         self.buffer_tensor_list: list[list[Any]] = [
             [] for _ in range(num_buffer_tensor)
         ]
```
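The old code derived the buffer count by counting non-`-1` slots in one layer's expert map; the new code reads `local_num_experts` from the last MoE layer directly. A toy sketch of why the two counts agree (the rank, expert range, and sizes here are hypothetical):

```python
# On a rank that owns experts 4..7 out of 16 logical experts, the per-rank
# expert map stores a local slot index for owned experts and -1 elsewhere,
# so counting non-(-1) entries equals the local expert count.
import torch

global_expert_num = 16
local_num_experts = 4  # hypothetical: experts 4..7 live on this rank

expert_map = torch.full((global_expert_num,), -1, dtype=torch.int64)
expert_map[4:8] = torch.arange(local_num_experts)

# Old derivation (removed in this PR):
num_buffer_tensor_old = torch.where(expert_map != -1)[0].numel()
# New derivation: read the count directly; this stands in for
# `self.model.model.layers[-1].mlp.experts.local_num_experts`.
num_buffer_tensor_new = local_num_experts

assert num_buffer_tensor_old == num_buffer_tensor_new == 4
```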
```diff
@@ -88,8 +76,6 @@ class VllmEplbAdaptor(EplbAdaptor):
             self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] = \
                 self.model.get_log2phy_map(self.num_dense_layers + layer_idx)
-
-        self.all_topk_ids = []

     def init_buffer_tensor(self, num_buffer_tensor):
         for buffer_id in range(num_buffer_tensor):
             for name in self.expert_weight_names:
```
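The `log2phy_map_per_layer` kept here is what the PR title refers to: with EPLB enabled, the all-gather dispatch path must translate the router's logical expert ids into physical ids. An illustrative sketch of that lookup (not the PR's exact code; the map values and shapes are made up):

```python
# Router outputs logical expert ids; after EPLB replicates or relocates
# experts, the per-layer log2phy table remaps them to physical expert ids
# before the all-gather dispatch.
import torch

log2phy_map = torch.tensor([0, 1, 2, 5, 3, 4])  # hypothetical logical -> physical
topk_ids = torch.tensor([[3, 1], [5, 0]])       # logical ids chosen by the router

physical_topk_ids = log2phy_map[topk_ids]       # ids the dispatch actually uses
print(physical_topk_ids)                        # tensor([[5, 1], [4, 0]])
```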
```diff
@@ -169,7 +155,6 @@ class VllmEplbAdaptor(EplbAdaptor):
             json.dump(record, f, indent=4)

     def do_update_expert_map(self, layer_id, updated_expert_map):
         self.expert_map_per_layer[layer_id].copy_(updated_expert_map)
-        self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)

     def do_update_expert_weight(self, layer_id, local_expert_to_replace,
```
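With the CPU-side copy removed, `do_update_expert_map` only touches the device map. The in-place `copy_` matters here because other components may already hold a reference to the same map tensor; a minimal sketch of that behavior (tensor values are illustrative):

```python
# `copy_` writes in place, so any module that already holds a reference to
# the map tensor observes the new routing without being handed a new object.
import torch

expert_map = torch.tensor([0, 1, -1, -1])  # stand-in for one layer's map
alias = expert_map                         # e.g. the reference held by the MoE layer

updated = torch.tensor([-1, 0, 1, -1])
expert_map.copy_(updated)                  # in-place update; `alias` sees it too

assert torch.equal(alias, updated)
```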
```diff
@@ -38,7 +38,6 @@ class EplbUpdator:
     def set_adaptor(self, adaptor):
         self.adaptor = adaptor
         self.num_moe_layers = self.adaptor.num_moe_layers
-        self.global_expert_num = self.adaptor.global_expert_num

     def init_eplb(self, expert_map_path, process):
         self.rank_id = dist.get_rank()
```