[MOE][Bugfix] Cancel H2D for expert_map (#7000)
### What this PR does / why we need it?
If `expert_map` is kept on the device, long-output scenarios can occasionally produce repeated answers. This PR cancels the host-to-device (H2D) copy so the per-rank `expert_map` stays on the host.

Verified with dsv3.2-exp-w8a8: no garbled characters appear in the output.
| dataset  | version | metric   | mode | vllm-api-stream-chat |
|----------|---------|----------|------|----------------------|
| aime2025 | ef2f4f  | accuracy | gen  | 60.00                |
- vLLM version: v0.16.0
- vLLM main: 15d76f74e2
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
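For orientation, a minimal, self-contained sketch of the contract the diff below touches. The function name, the even-split layout, and the toy values are illustrative only; the 4-tuple shape `(global_expert_map, local_expert_map, log2phy, n_redundant)` is taken from the return statements in the diff. The point of the PR is that the returned local map now stays on the CPU.

```python
from typing import Optional, Tuple

import torch


def init_eplb_config_sketch(
    ep_size: int, ep_rank: int, n_experts: int
) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[torch.Tensor], int]:
    # Illustrative stand-in, not the real implementation: build a contiguous
    # per-rank expert map on the host (assumes ep_size divides n_experts).
    expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
    n_local = n_experts // ep_size
    start = ep_rank * n_local
    expert_map[start:start + n_local] = torch.arange(n_local, dtype=torch.int32)
    # After this PR the local map is returned as-is: no .npu() H2D copy.
    return None, expert_map, None, 0


_, local_map, _, _ = init_eplb_config_sketch(ep_size=2, ep_rank=1, n_experts=8)
assert local_map.device.type == "cpu"
```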
```diff
@@ -22,6 +22,7 @@ from collections import defaultdict
 import numpy as np
 import torch
 from vllm.logger import logger
+from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
 
 
 def expert_file_to_tensor(expert_map_path, layer_id):
```
```diff
@@ -60,6 +61,11 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
     global_placement = None
     eplb_enable = eplb_config.dynamic_eplb
     n_redundant = eplb_config.num_redundant_experts if eplb_enable else 0
+
+    if ep_size == 1:
+        assert not eplb_enable, "EPLB must used in expert parallelism."
+        return None, None, None, n_redundant
+
     if expert_map_path:
         if not (os.path.exists(expert_map_path) and os.access(expert_map_path, os.R_OK)):
             raise ValueError("Invalid EPLB path")
```
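The early return added above means a single-rank setup never builds any maps, and enabling dynamic EPLB without expert parallelism is rejected outright. A standalone toy of that guard (names invented; the assert message mirrors the committed code):

```python
def eplb_guard(ep_size: int, eplb_enable: bool) -> bool:
    # Toy mirror of the added early return: EPLB presupposes expert
    # parallelism, so ep_size == 1 with EPLB enabled is a config error.
    if ep_size == 1:
        assert not eplb_enable, "EPLB must used in expert parallelism."
        return False  # no expert maps needed on a single rank
    return True


assert eplb_guard(ep_size=1, eplb_enable=False) is False
assert eplb_guard(ep_size=4, eplb_enable=True) is True
```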
```diff
@@ -71,13 +77,13 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
             raise ValueError("Eplb supports only w8a8_dynamic quantization.")
         else:
             eplb_enable = False
+    elif not eplb_enable:
+        _, expert_map, _ = determine_expert_map(ep_size, moe_config.ep_rank, n_experts)
+        return None, expert_map, None, 0
 
     if global_placement is None:
         global_placement = generate_global_placement(n_experts, ep_size, n_redundant)
 
-    if ep_size == 1:
-        assert not eplb_enable, "EPLB must used in expert parallelism."
-        return None, None, None, n_redundant
     global_expert_map = []
     for rankid in range(ep_size):
         expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
```
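For the EPLB path, the hunk above builds one `expert_map` per rank from a global placement and stacks them. A toy version with invented values, following the same `torch.full` / `arange` pattern as the diff:

```python
import torch

n_experts, ep_size = 4, 2
# global_placement[rank] lists the logical experts hosted on that rank
# (toy values; the real placement comes from generate_global_placement).
global_placement = [torch.tensor([0, 3]), torch.tensor([1, 2])]

global_expert_map = []
for rankid in range(ep_size):
    expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
    local_placement = global_placement[rankid]
    expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32)
    global_expert_map.append(expert_map)

print(torch.stack(global_expert_map))
# tensor([[ 0, -1, -1,  1],
#         [-1,  0,  1, -1]], dtype=torch.int32)
```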
```diff
@@ -85,7 +91,7 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
         expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32)
         global_expert_map.append(expert_map)
         if rankid == moe_config.ep_rank:
-            local_expert_map = expert_map.npu()
+            local_expert_map = expert_map
     log2phy = generate_log2phy_map(global_expert_map, moe_config.ep_rank).npu() if eplb_enable else None
 
     return torch.stack(global_expert_map), local_expert_map, log2phy, n_redundant
```
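This hunk is the fix the title refers to: the per-rank `local_expert_map` no longer gets an eager `.npu()` upload and stays a CPU tensor, while `log2phy` is still moved to the device. A before/after toy (`.npu()` is the Ascend device move provided by torch_npu, shown only as a comment so the snippet runs anywhere):

```python
import torch

expert_map = torch.tensor([0, -1, -1, 1], dtype=torch.int32)  # host tensor

# Before: local_expert_map = expert_map.npu()  # H2D copy (torch_npu, Ascend)
local_expert_map = expert_map                  # after: stays on the host
assert local_expert_map.device.type == "cpu"
```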
```diff
@@ -286,9 +286,7 @@ class AscendFusedMoE(FusedMoE):
         )
         self.global_num_experts = num_experts + self.global_redundant_expert_num
         self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy is not None)
-        self.local_num_experts = (
-            torch.sum(self._expert_map != -1).item() if self._expert_map is not None else self.global_num_experts
-        )
+        self.local_num_experts = self.global_num_experts // self.ep_size
         if self._expert_map is not None:
             logger.info_once(
                 "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
```
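The two formulas for `local_num_experts` agree whenever experts are spread evenly across EP ranks, so the simpler division can replace the per-entry count without touching `self._expert_map` (toy values below; assumes `ep_size` divides `global_num_experts`):

```python
import torch

global_num_experts, ep_size = 8, 2
expert_map = torch.tensor([0, 1, 2, 3, -1, -1, -1, -1], dtype=torch.int32)

old = torch.sum(expert_map != -1).item()  # removed: count this rank's slots
new = global_num_experts // ep_size       # added: assume an even split
assert old == new == 4
```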