[bugfix] Fix flashinfer cutlass EP moe after MoE refactor (#8630)
@@ -119,7 +119,8 @@ class FusedMoE(torch.nn.Module):
                 * self.num_local_experts : (self.moe_ep_rank + 1)
                 * self.num_local_experts
             ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
-            self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
+            if not self.enable_flashinfer_cutlass_moe:
+                self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
 
         self.routed_scaling_factor = routed_scaling_factor
         assert intermediate_size % self.moe_tp_size == 0
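
This hunk gates the CUDA copy of the expert map: expert_map_cpu stays on the CPU, and expert_map_gpu is only materialized when the flashinfer cutlass MoE path is disabled. For orientation, the snippet below is a minimal, runnable sketch of the expert-map construction the surrounding code performs under expert parallelism; build_expert_map is a hypothetical helper written for this illustration, not an sglang API.

import torch

def build_expert_map(num_experts: int, ep_size: int, ep_rank: int) -> torch.Tensor:
    # Map global expert ids to local ids for one EP rank; -1 marks experts
    # hosted on other ranks.
    assert num_experts % ep_size == 0
    num_local_experts = num_experts // ep_size
    expert_map = torch.full((num_experts,), -1, dtype=torch.int32, device="cpu")
    # This rank owns the contiguous block of global experts
    # [ep_rank * num_local_experts, (ep_rank + 1) * num_local_experts).
    expert_map[
        ep_rank * num_local_experts : (ep_rank + 1) * num_local_experts
    ] = torch.arange(0, num_local_experts, dtype=torch.int32, device="cpu")
    return expert_map

# 8 experts over 4 EP ranks: rank 1 owns global experts 2 and 3.
print(build_expert_map(8, 4, 1))  # tensor([-1, -1,  0,  1, -1, -1, -1, -1], dtype=torch.int32)
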
@@ -437,6 +437,11 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
+            if self.enable_ep_moe:
+                self.ep_size = self.tp_size
+                logger.warning(
+                    f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+                )
 
         if self.enable_flashinfer_trtllm_moe:
             assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
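
The hunk above reconciles the parallelism settings at startup: when both flashinfer cutlass MoE and EP MoE are requested, ep_size is forced to tp_size and a warning is logged. The snippet below is a standalone, runnable sketch of that control flow; MoEArgs and reconcile() are hypothetical stand-ins for sglang's ServerArgs, used here only so the logic executes in isolation.

import logging
import os
from dataclasses import dataclass

logger = logging.getLogger(__name__)

@dataclass
class MoEArgs:
    tp_size: int = 8
    ep_size: int = 1
    quantization: str = "modelopt_fp4"
    enable_ep_moe: bool = True
    enable_flashinfer_cutlass_moe: bool = True
    enable_flashinfer_trtllm_moe: bool = False

    def reconcile(self) -> None:
        if self.enable_flashinfer_cutlass_moe:
            # Per the diff: modelopt_fp4 quantization is required here.
            assert (
                self.quantization == "modelopt_fp4"
            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
            os.environ["TRTLLM_ENABLE_PDL"] = "1"
            if self.enable_ep_moe:
                # Mirror of the diff: with EP MoE enabled alongside the
                # cutlass path, the expert parallel size is pinned to tp_size.
                self.ep_size = self.tp_size
                logger.warning(
                    "Flashinfer cutlass MoE and EP MoE are enabled; "
                    f"expert parallel size adjusted to tp_size={self.tp_size}."
                )
        if self.enable_flashinfer_trtllm_moe:
            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"

args = MoEArgs()
args.reconcile()
assert args.ep_size == args.tp_size  # 8
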