[MOE][Bugfix] Cancel H2D for expert_map (#7000)
### What this PR does / why we need it?
If expert_map is kept on the device, occasional repeated answers may appear
in long-output scenarios; keeping it on the host avoids the extra H2D copy.
Verified with dsv3.2-exp-w8a8: no garbled characters appear in the output.
| dataset | version | metric | mode | vllm-api-stream-chat |
|----- | ----- | ----- | ----- | -----|
| aime2025 | ef2f4f | accuracy | gen | 60.00 |
- vLLM version: v0.16.0
- vLLM main:
15d76f74e2
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
This commit is contained in:
@@ -22,6 +22,7 @@ from collections import defaultdict
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from vllm.logger import logger
|
from vllm.logger import logger
|
||||||
|
from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
|
||||||
|
|
||||||
|
|
||||||
def expert_file_to_tensor(expert_map_path, layer_id):
|
def expert_file_to_tensor(expert_map_path, layer_id):
|
||||||
@@ -60,6 +61,11 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
|
|||||||
global_placement = None
|
global_placement = None
|
||||||
eplb_enable = eplb_config.dynamic_eplb
|
eplb_enable = eplb_config.dynamic_eplb
|
||||||
n_redundant = eplb_config.num_redundant_experts if eplb_enable else 0
|
n_redundant = eplb_config.num_redundant_experts if eplb_enable else 0
|
||||||
|
|
||||||
|
if ep_size == 1:
|
||||||
|
assert not eplb_enable, "EPLB must used in expert parallelism."
|
||||||
|
return None, None, None, n_redundant
|
||||||
|
|
||||||
if expert_map_path:
|
if expert_map_path:
|
||||||
if not (os.path.exists(expert_map_path) and os.access(expert_map_path, os.R_OK)):
|
if not (os.path.exists(expert_map_path) and os.access(expert_map_path, os.R_OK)):
|
||||||
raise ValueError("Invalid EPLB path")
|
raise ValueError("Invalid EPLB path")
|
||||||
@@ -71,13 +77,13 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
|
|||||||
raise ValueError("Eplb supports only w8a8_dynamic quantization.")
|
raise ValueError("Eplb supports only w8a8_dynamic quantization.")
|
||||||
else:
|
else:
|
||||||
eplb_enable = False
|
eplb_enable = False
|
||||||
|
elif not eplb_enable:
|
||||||
|
_, expert_map, _ = determine_expert_map(ep_size, moe_config.ep_rank, n_experts)
|
||||||
|
return None, expert_map, None, 0
|
||||||
|
|
||||||
if global_placement is None:
|
if global_placement is None:
|
||||||
global_placement = generate_global_placement(n_experts, ep_size, n_redundant)
|
global_placement = generate_global_placement(n_experts, ep_size, n_redundant)
|
||||||
|
|
||||||
if ep_size == 1:
|
|
||||||
assert not eplb_enable, "EPLB must used in expert parallelism."
|
|
||||||
return None, None, None, n_redundant
|
|
||||||
global_expert_map = []
|
global_expert_map = []
|
||||||
for rankid in range(ep_size):
|
for rankid in range(ep_size):
|
||||||
expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
|
expert_map = torch.full((n_experts,), -1, dtype=torch.int32)
|
||||||
@@ -85,7 +91,7 @@ def init_eplb_config(eplb_config, layer_id, moe_config):
|
|||||||
expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32)
|
expert_map[local_placement] = torch.arange(local_placement.shape[0], dtype=torch.int32)
|
||||||
global_expert_map.append(expert_map)
|
global_expert_map.append(expert_map)
|
||||||
if rankid == moe_config.ep_rank:
|
if rankid == moe_config.ep_rank:
|
||||||
local_expert_map = expert_map.npu()
|
local_expert_map = expert_map
|
||||||
log2phy = generate_log2phy_map(global_expert_map, moe_config.ep_rank).npu() if eplb_enable else None
|
log2phy = generate_log2phy_map(global_expert_map, moe_config.ep_rank).npu() if eplb_enable else None
|
||||||
|
|
||||||
return torch.stack(global_expert_map), local_expert_map, log2phy, n_redundant
|
return torch.stack(global_expert_map), local_expert_map, log2phy, n_redundant
|
||||||
|
|||||||
@@ -286,9 +286,7 @@ class AscendFusedMoE(FusedMoE):
|
|||||||
)
|
)
|
||||||
self.global_num_experts = num_experts + self.global_redundant_expert_num
|
self.global_num_experts = num_experts + self.global_redundant_expert_num
|
||||||
self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy is not None)
|
self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy is not None)
|
||||||
self.local_num_experts = (
|
self.local_num_experts = self.global_num_experts // self.ep_size
|
||||||
torch.sum(self._expert_map != -1).item() if self._expert_map is not None else self.global_num_experts
|
|
||||||
)
|
|
||||||
if self._expert_map is not None:
|
if self._expert_map is not None:
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
|
"[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
|
||||||
|
|||||||
Reference in New Issue
Block a user