diff --git a/docs/backend/server_arguments.md b/docs/backend/server_arguments.md
index f742083f1..985596292 100644
--- a/docs/backend/server_arguments.md
+++ b/docs/backend/server_arguments.md
@@ -214,7 +214,8 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--ep-size` | The expert parallelism size. | 1 |
 | `--enable-ep-moe` | Enabling expert parallelism for moe. The ep size is equal to the tp size. | False |
 | `--enable-deepep-moe` | Enabling DeepEP MoE implementation for EP MoE. | False |
-| `--enable-flashinfer-moe` | Enabling Flashinfer MoE implementation. | False |
+| `--enable-flashinfer-cutlass-moe` | Enabling Flashinfer Cutlass MoE implementation for high throughput. | False |
+| `--enable-flashinfer-trtllm-moe` | Enabling Flashinfer Trtllm MoE implementation for low latency. | False |
 | `--deepep-mode` | Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch. | auto |
 | `--ep-num-redundant-experts` | Allocate this number of redundant experts in expert parallel. | 0 |
 | `--ep-dispatch-algorithm` | The algorithm to choose ranks for redundant experts in expert parallel. | None |
diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
index 6aa83dc00..b978eaf3a 100644
--- a/python/sglang/srt/layers/moe/ep_moe/layer.py
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -1268,7 +1268,7 @@ class FlashInferEPMoE(EPMoE):
             topk_group=self.topk_group,
             intermediate_size=self.w2_weight.shape[2],
             local_expert_offset=self.start_expert_id,
-            local_num_experts=self.num_experts_per_partition,
+            local_num_experts=self.num_local_experts,
             routed_scaling_factor=self.routed_scaling_factor,
             tile_tokens_dim=_get_tile_tokens_dim(
                 hidden_states.shape[0], self.top_k, self.num_experts
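
For context on the `layer.py` hunk, here is a minimal sketch of the contiguous expert-parallel partitioning that `local_num_experts` and `local_expert_offset` describe. The function and the names `num_experts`, `ep_size`, and `ep_rank` are illustrative (not taken from this patch); the invariant shown is only that each EP rank owns a contiguous slice of experts, so the kernel needs the per-rank count (`num_local_experts`) together with the rank's global offset (`start_expert_id`):

```python
# Illustrative sketch of contiguous expert partitioning across EP ranks.
# Assumed layout; names other than num_local_experts / start_expert_id are hypothetical.
def partition_experts(num_experts: int, ep_size: int, ep_rank: int) -> tuple[int, int]:
    assert num_experts % ep_size == 0, "experts must divide evenly across EP ranks"
    num_local_experts = num_experts // ep_size      # experts owned by each rank
    start_expert_id = ep_rank * num_local_experts   # this rank's global expert offset
    return start_expert_id, num_local_experts

# Example: 16 experts over ep_size=4 -> rank 2 owns experts [8, 12).
print(partition_experts(16, 4, 2))  # (8, 4)
```

Under this layout the two kwargs stay consistent by construction, which is presumably why the hunk switches the call to the `num_local_experts` attribute alongside the existing `start_expert_id` offset.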