diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index ce76d2f2d..88e150e4d 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -119,7 +119,8 @@ class FusedMoE(torch.nn.Module): * self.num_local_experts : (self.moe_ep_rank + 1) * self.num_local_experts ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu") - self.expert_map_gpu = self.expert_map_cpu.to(device="cuda") + if not self.enable_flashinfer_cutlass_moe: + self.expert_map_gpu = self.expert_map_cpu.to(device="cuda") self.routed_scaling_factor = routed_scaling_factor assert intermediate_size % self.moe_tp_size == 0 diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 0a35fa00e..507fb7121 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -437,6 +437,11 @@ class ServerArgs: self.quantization == "modelopt_fp4" ), "modelopt_fp4 quantization is required for Flashinfer MOE" os.environ["TRTLLM_ENABLE_PDL"] = "1" + if self.enable_ep_moe: + self.ep_size = self.tp_size + logger.warning( + f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size [{self.tp_size}]." + ) if self.enable_flashinfer_trtllm_moe: assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"