import json
import random
from typing import Dict, List

import torch
import torch.distributed as dist
class ExpertLoadBalancer(object):
    """Static expert-placement plan for MoE layers, loaded from a JSON file.

    The JSON file (``expert_map_path``) describes, per MoE layer and per
    device, which logical experts are hosted on which rank.  From it this
    class derives the placement map (expert id -> local slot per rank) and
    the logical-to-physical map used for redundant experts.
    """

    def __init__(self, expert_map_path, num_experts):
        # Location of the JSON expert-map description.
        self.expert_map_path = expert_map_path
        # Number of logical experts per MoE layer.
        self.num_experts = num_experts
        # Raw nested lists parsed from the JSON file; kept around so
        # check_expert_map_tensor() can compare them across ranks.
        self.tensor_data = []
        (self.expert_map_tensor,
         self.layers_num,
         self.ranks_num) = self._expert_file_to_tensor()
        # Physical expert count = logical experts + redundant copies.
        self.global_expert_num = (num_experts +
                                  self.get_global_redundant_expert_num())
        self.expert_placement_map = self.generate_expert_placement_map()
def _expert_file_to_tensor(self):
|
|
|
|
|
with open(self.expert_map_path, "r") as f:
|
|
|
|
|
data = json.load(f)
|
|
|
|
|
layers_num = data["moe_layer_count"]
|
|
|
|
|
gpus_num = data["layer_list"][0]["device_count"]
|
|
|
|
|
|
|
|
|
|
for layer in data["layer_list"]:
|
|
|
|
|
device_data = []
|
|
|
|
|
for device in layer["device_list"]:
|
|
|
|
|
device_data.append(device["device_expert"])
|
2025-10-24 17:10:31 +08:00
|
|
|
self.tensor_data.append(device_data)
|
|
|
|
|
expert_map_tensor = torch.tensor(self.tensor_data, dtype=torch.int32)
|
2025-06-09 19:28:11 +08:00
|
|
|
return expert_map_tensor, layers_num, gpus_num
|
|
|
|
|
|
|
|
|
|
def generate_index_dicts(self, tensor_2d):
|
|
|
|
|
dict_list = []
|
|
|
|
|
current_idx = 0
|
|
|
|
|
|
|
|
|
|
for row in tensor_2d:
|
|
|
|
|
value_to_index = {}
|
|
|
|
|
for i in range(row.size(0)):
|
|
|
|
|
value = row[i].item()
|
|
|
|
|
value_to_index[value] = current_idx + i
|
|
|
|
|
dict_list.append(value_to_index)
|
|
|
|
|
current_idx += row.size(0)
|
|
|
|
|
|
|
|
|
|
return dict_list
|
|
|
|
|
|
|
|
|
|
def generate_expert_placement_map(self):
|
|
|
|
|
expert_placement_map = torch.full(
|
BugFix: Resolve shape mismatch in eplb update and calculation issues in quant_apply_mlp (#4777)
## Description
This PR addresses two key issues in the MoE module when redundant
experts are enabled, and fixes a calculation precision bug in the
forward inference of quantized MLP:
### 1. Shape Mismatch in EPLB Expert Map Update
- **Root Cause**:
When redundant experts are turned on, a shape inconsistency occurs
during the expert map update in `Vllm_apaptor`:
- The shape of `self.expert_map_per_layer[layer_id]` is
`[num_physical_experts,]` (aligned with physical expert count).
- The shape of `updated_expert_map` is `[num_logical_experts,]` (aligned
with logical expert count).
- Indices in `self.expert_map_per_layer[layer_id]` that exceed the
logical expert count cannot be properly mapped, leading to tensor shape
mismatch errors.
- The same shape mismatch exists in the `log2phy` map update (between
`self.log2phy_map_per_layer[layer_id]` and `updated_log2phy_map`).
- **Fix**:
- Fix the shape initialization of `expert_map_per_layer` and
`log2phy_map_per_layer` to be consistently set to
`[num_physical_experts,]` across the module lifecycle.
- Align the shape of `updated_expert_map` and `updated_log2phy_map` with
the pre-initialized physical-expert-sized tensors during update
operations, ensuring shape consistency for index mapping.
### 2. Calculation Precision Issue in Quantized MoE MLP Forward
Inference
- **Root Cause**:
In the forward pass of `moe_mlp`, the
`torch_npu.npu_dequant_swiglu_quant` operator only accepts group lists
in **Count format** as input. However, the group list provided by
`quant_apply_mlp` was in **Cumsum format**, which caused operator input
format mismatch and degraded calculation precision.
- **Fix**:
- Convert the cumsum-formatted group list from `quant_apply_mlp` to
Count format before passing it to `torch_npu.npu_dequant_swiglu_quant`.
- Ensure the input format of the dequantization operator meets its
requirements, restoring the expected calculation precision for quantized
MoE MLP layers.
## Impact
- Resolves shape mismatch errors in EPLB expert/log2phy map updates when
redundant experts are enabled, ensuring stable expert routing.
- Fixes quantized MoE MLP forward precision issues on NPU, aligning
operator input formats with NPU kernel requirements.
- No breaking changes to existing interfaces; the fixes are
backward-compatible for scenarios without redundant experts enabled.
---------
Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Signed-off-by: Mercykid-bash <ruanche0218@gmail.com>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-09 15:46:58 +08:00
|
|
|
(self.layers_num, self.ranks_num, self.num_experts),
|
2025-06-09 19:28:11 +08:00
|
|
|
-1,
|
|
|
|
|
dtype=torch.int32,
|
|
|
|
|
)
|
|
|
|
|
for layer_id in range(self.layers_num):
|
|
|
|
|
for gpu_id in range(self.ranks_num):
|
|
|
|
|
e_ids = self.expert_map_tensor[layer_id, gpu_id]
|
|
|
|
|
expert_placement_map[layer_id, gpu_id,
|
|
|
|
|
e_ids] = torch.arange(len(e_ids),
|
|
|
|
|
dtype=torch.int32)
|
|
|
|
|
return expert_placement_map
|
|
|
|
|
|
|
|
|
|
def generate_log2phy_expert_map(self, layer_id):
|
|
|
|
|
concatenated = torch.flatten(self.expert_map_tensor[layer_id])
|
|
|
|
|
rank_expert_to_global = self.generate_index_dicts(
|
|
|
|
|
self.expert_map_tensor[layer_id])
|
|
|
|
|
result_dict: Dict[int, List[int]] = {}
|
|
|
|
|
for idx, value in enumerate(concatenated):
|
|
|
|
|
key = value.item()
|
|
|
|
|
if key not in result_dict:
|
|
|
|
|
result_dict[key] = []
|
|
|
|
|
result_dict[key].append(idx)
|
|
|
|
|
|
BugFix: Resolve shape mismatch in eplb update and calculation issues in quant_apply_mlp (#4777)
## Description
This PR addresses two key issues in the MoE module when redundant
experts are enabled, and fixes a calculation precision bug in the
forward inference of quantized MLP:
### 1. Shape Mismatch in EPLB Expert Map Update
- **Root Cause**:
When redundant experts are turned on, a shape inconsistency occurs
during the expert map update in `Vllm_apaptor`:
- The shape of `self.expert_map_per_layer[layer_id]` is
`[num_physical_experts,]` (aligned with physical expert count).
- The shape of `updated_expert_map` is `[num_logical_experts,]` (aligned
with logical expert count).
- Indices in `self.expert_map_per_layer[layer_id]` that exceed the
logical expert count cannot be properly mapped, leading to tensor shape
mismatch errors.
- The same shape mismatch exists in the `log2phy` map update (between
`self.log2phy_map_per_layer[layer_id]` and `updated_log2phy_map`).
- **Fix**:
- Fix the shape initialization of `expert_map_per_layer` and
`log2phy_map_per_layer` to be consistently set to
`[num_physical_experts,]` across the module lifecycle.
- Align the shape of `updated_expert_map` and `updated_log2phy_map` with
the pre-initialized physical-expert-sized tensors during update
operations, ensuring shape consistency for index mapping.
### 2. Calculation Precision Issue in Quantized MoE MLP Forward
Inference
- **Root Cause**:
In the forward pass of `moe_mlp`, the
`torch_npu.npu_dequant_swiglu_quant` operator only accepts group lists
in **Count format** as input. However, the group list provided by
`quant_apply_mlp` was in **Cumsum format**, which caused operator input
format mismatch and degraded calculation precision.
- **Fix**:
- Convert the cumsum-formatted group list from `quant_apply_mlp` to
Count format before passing it to `torch_npu.npu_dequant_swiglu_quant`.
- Ensure the input format of the dequantization operator meets its
requirements, restoring the expected calculation precision for quantized
MoE MLP layers.
## Impact
- Resolves shape mismatch errors in EPLB expert/log2phy map updates when
redundant experts are enabled, ensuring stable expert routing.
- Fixes quantized MoE MLP forward precision issues on NPU, aligning
operator input formats with NPU kernel requirements.
- No breaking changes to existing interfaces; the fixes are
backward-compatible for scenarios without redundant experts enabled.
---------
Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Signed-off-by: Mercykid-bash <ruanche0218@gmail.com>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-09 15:46:58 +08:00
|
|
|
log2phy_map = torch.full((self.ranks_num, self.num_experts),
|
2025-06-09 19:28:11 +08:00
|
|
|
-1,
|
|
|
|
|
dtype=torch.int32)
|
|
|
|
|
for rank in range(self.ranks_num):
|
|
|
|
|
for key in result_dict:
|
|
|
|
|
indices_in_concat = result_dict[key]
|
|
|
|
|
if key in rank_expert_to_global[rank]:
|
|
|
|
|
log2phy_map[rank][key] = rank_expert_to_global[rank][key]
|
|
|
|
|
else:
|
|
|
|
|
chosen_index = random.choice(indices_in_concat)
|
|
|
|
|
log2phy_map[rank][key] = chosen_index
|
|
|
|
|
return log2phy_map
|
|
|
|
|
|
|
|
|
|
def get_rank_placement_map(self, layer_id, rank_id):
|
2025-10-24 17:10:31 +08:00
|
|
|
layer_expert_map = self.expert_placement_map[layer_id]
|
2025-06-09 19:28:11 +08:00
|
|
|
rank_expert_map = layer_expert_map[rank_id].to(
|
|
|
|
|
torch.npu.current_device())
|
|
|
|
|
rank_local_expert_num = torch.sum(torch.ne(rank_expert_map, -1)).item()
|
|
|
|
|
return rank_local_expert_num, rank_expert_map
|
|
|
|
|
|
|
|
|
|
def get_rank_log2phy_map(self, layer_id, rank_id):
|
|
|
|
|
layer_log2phy_map = self.generate_log2phy_expert_map(layer_id)
|
|
|
|
|
return layer_log2phy_map[rank_id]
|
|
|
|
|
|
|
|
|
|
def get_global_redundant_expert_num(self):
|
|
|
|
|
global_redundant_expert_num = (
|
|
|
|
|
len(self.expert_map_tensor[0][0]) * self.ranks_num -
|
2025-12-03 12:00:05 +08:00
|
|
|
self.num_experts)
|
2025-06-09 19:28:11 +08:00
|
|
|
return global_redundant_expert_num
|
2025-10-24 17:10:31 +08:00
|
|
|
|
|
|
|
|
def check_expert_map_tensor(self):
|
|
|
|
|
if dist.is_initialized():
|
|
|
|
|
try:
|
|
|
|
|
rank = dist.get_rank()
|
|
|
|
|
world_size = dist.get_world_size()
|
|
|
|
|
all_expert_maps = [None for _ in range(world_size)]
|
|
|
|
|
dist.all_gather_object(all_expert_maps, self.tensor_data)
|
|
|
|
|
for rank_id, expert_map_tensor in enumerate(all_expert_maps):
|
|
|
|
|
if self.tensor_data != expert_map_tensor:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
f"The expert map of rank{rank} is not equal to rank{rank_id}"
|
|
|
|
|
)
|
|
|
|
|
return True
|
|
|
|
|
except Exception as e:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
f"The expert maps of all ranks are inconsistency: {e}")
|