[MOE][Bugfix] Cancel H2D for expert_map (#7000)

### What this PR does / why we need it?
If expert_map is kept on the device, occasional repeated answers may appear
in long-output scenarios.

dsv3.2-exp-w8a8
No garbled characters are displayed in the output.
| dataset | version | metric | mode | vllm-api-stream-chat |
|----- | ----- | ----- | ----- | -----|
| aime2025 | ef2f4f | accuracy | gen | 60.00 |

- vLLM version: v0.16.0
- vLLM main:
15d76f74e2

Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
LI SHENGYONG
2026-03-09 17:53:54 +08:00
committed by GitHub
parent 82fdd40d49
commit a76a509fae
2 changed files with 11 additions and 7 deletions

View File

@@ -22,6 +22,7 @@ from collections import defaultdict
import numpy as np import numpy as np
import torch import torch
from vllm.logger import logger from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
def expert_file_to_tensor(expert_map_path, layer_id): def expert_file_to_tensor(expert_map_path, layer_id):
@@ -60,6 +61,11 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
global_placement = None global_placement = None
eplb_enable = eplb_config.dynamic_eplb eplb_enable = eplb_config.dynamic_eplb
n_redundant = eplb_config.num_redundant_experts if eplb_enable else 0 n_redundant = eplb_config.num_redundant_experts if eplb_enable else 0
if ep_size == 1:
assert not eplb_enable, "EPLB must used in expert parallelism."
return None, None, None, n_redundant
if expert_map_path: if expert_map_path:
if not (os.path.exists(expert_map_path) and os.access(expert_map_path, os.R_OK)): if not (os.path.exists(expert_map_path) and os.access(expert_map_path, os.R_OK)):
raise ValueError("Invalid EPLB path") raise ValueError("Invalid EPLB path")
@@ -71,13 +77,13 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
raise ValueError("Eplb supports only w8a8_dynamic quantization.") raise ValueError("Eplb supports only w8a8_dynamic quantization.")
else: else:
eplb_enable = False eplb_enable = False
elif not eplb_enable:
_, expert_map, _ = determine_expert_map(ep_size, moe_config.ep_rank, n_experts)
return None, expert_map, None, 0
if global_placement is None: if global_placement is None:
global_placement = generate_global_placement(n_experts, ep_size, n_redundant) global_placement = generate_global_placement(n_experts, ep_size, n_redundant)
if ep_size == 1:
assert not eplb_enable, "EPLB must used in expert parallelism."
return None, None, None, n_redundant
global_expert_map = [] global_expert_map = []
for rankid in range(ep_size): for rankid in range(ep_size):
expert_map = torch.full((n_experts,), -1, dtype=torch.int32) expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
@@ -85,7 +91,7 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32) expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32)
global_expert_map.append(expert_map) global_expert_map.append(expert_map)
if rankid == moe_config.ep_rank: if rankid == moe_config.ep_rank:
local_expert_map = expert_map.npu() local_expert_map = expert_map
log2phy = generate_log2phy_map(global_expert_map, moe_config.ep_rank).npu() if eplb_enable else None log2phy = generate_log2phy_map(global_expert_map, moe_config.ep_rank).npu() if eplb_enable else None
return torch.stack(global_expert_map), local_expert_map, log2phy, n_redundant return torch.stack(global_expert_map), local_expert_map, log2phy, n_redundant

View File

@@ -286,9 +286,7 @@ class AscendFusedMoE(FusedMoE):
) )
self.global_num_experts = num_experts + self.global_redundant_expert_num self.global_num_experts = num_experts + self.global_redundant_expert_num
self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy is not None) self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy is not None)
self.local_num_experts = ( self.local_num_experts = self.global_num_experts // self.ep_size
torch.sum(self._expert_map != -1).item() if self._expert_map is not None else self.global_num_experts
)
if self._expert_map is not None: if self._expert_map is not None:
logger.info_once( logger.info_once(
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global" "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"