[MOE][Bugfix] Cancel H2D for expert_map (#7000)
### What this PR does / why we need it?
If the expert_map tensor resides on the device, occasional repeated answers can appear
in long-output scenarios; keeping it on the host cancels the extra H2D copy that triggers this.
Verified with dsv3.2-exp-w8a8: no garbled characters appear in the output.
| dataset | version | metric | mode | vllm-api-stream-chat |
|----- | ----- | ----- | ----- | -----|
| aime2025 | ef2f4f | accuracy | gen | 60.00 |
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -22,6 +22,7 @@ from collections import defaultdict
|
||||
import numpy as np
|
||||
import torch
|
||||
from vllm.logger import logger
|
||||
from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
|
||||
|
||||
|
||||
def expert_file_to_tensor(expert_map_path, layer_id):
|
||||
@@ -60,6 +61,11 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
|
||||
global_placement = None
|
||||
eplb_enable = eplb_config.dynamic_eplb
|
||||
n_redundant = eplb_config.num_redundant_experts if eplb_enable else 0
|
||||
|
||||
if ep_size == 1:
|
||||
assert not eplb_enable, "EPLB must used in expert parallelism."
|
||||
return None, None, None, n_redundant
|
||||
|
||||
if expert_map_path:
|
||||
if not (os.path.exists(expert_map_path) and os.access(expert_map_path, os.R_OK)):
|
||||
raise ValueError("Invalid EPLB path")
|
||||
@@ -71,13 +77,13 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
|
||||
raise ValueError("Eplb supports only w8a8_dynamic quantization.")
|
||||
else:
|
||||
eplb_enable = False
|
||||
elif not eplb_enable:
|
||||
_, expert_map, _ = determine_expert_map(ep_size, moe_config.ep_rank, n_experts)
|
||||
return None, expert_map, None, 0
|
||||
|
||||
if global_placement is None:
|
||||
global_placement = generate_global_placement(n_experts, ep_size, n_redundant)
|
||||
|
||||
if ep_size == 1:
|
||||
assert not eplb_enable, "EPLB must used in expert parallelism."
|
||||
return None, None, None, n_redundant
|
||||
global_expert_map = []
|
||||
for rankid in range(ep_size):
|
||||
expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
|
||||
@@ -85,7 +91,7 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
|
||||
expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32)
|
||||
global_expert_map.append(expert_map)
|
||||
if rankid == moe_config.ep_rank:
|
||||
local_expert_map = expert_map.npu()
|
||||
local_expert_map = expert_map
|
||||
log2phy = generate_log2phy_map(global_expert_map, moe_config.ep_rank).npu() if eplb_enable else None
|
||||
|
||||
return torch.stack(global_expert_map), local_expert_map, log2phy, n_redundant
|
||||
|
||||
Reference in New Issue
Block a user