[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)
@@ -23,7 +23,7 @@ spec:
         - /bin/bash
         - -c
         # please modify the sglang serving arguments below, as necessary.
-        # NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1
+        # NOTE: the --expert-parallel-size is for MoE model like DeepSeek-R1
         args:
         - |
           python3 -m sglang.launch_server \
@@ -36,7 +36,6 @@ spec:
             --host 0.0.0.0 \
             --port 8000 \
             --enable-metrics \
-            --enable-ep-moe \
             --expert-parallel-size 16
         env:
         - name: POD_INDEX # reflects the node-rank
@@ -39,13 +39,13 @@ $ python -m sglang.srt.disaggregation.mini_lb --prefill http://127.0.0.1:30000 -

 ```bash
 # prefill 0
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode normal --mem-fraction-static 0.8
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
 # prefill 1
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode normal --mem-fraction-static 0.8
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
 # decode 0
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode low_latency --mem-fraction-static 0.8 --max-running-requests 128
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
 # decode 1
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode low_latency --mem-fraction-static 0.8 --max-running-requests 128
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
 ```
 ### Advanced Configuration

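The substance of these command changes is the same in every line: the two DeepEP flags collapse into one backend selector. `--deepep-mode` is simply dropped, falling back to its `auto` default, which (per the arguments table further down) resolves to `normal` for prefill batches and `low_latency` for decode batches, matching the explicit values the old commands passed. Reduced to the changed flags only:

```bash
# before (deprecated)
python -m sglang.launch_server ... --enable-deepep-moe --deepep-mode normal       # prefill nodes
python -m sglang.launch_server ... --enable-deepep-moe --deepep-mode low_latency  # decode nodes

# after
python -m sglang.launch_server ... --moe-a2a-backend deepep                       # both roles
```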
@@ -103,13 +103,13 @@ $ python -m sglang.srt.disaggregation.mini_lb --prefill http://127.0.0.1:30000 -

 ```bash
 # prefill 0
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode normal --mem-fraction-static 0.8
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
 # prefill 1
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode normal --mem-fraction-static 0.8
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
 # decode 0
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode low_latency --mem-fraction-static 0.8 --max-running-requests 128
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
 # decode 1
-$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --enable-deepep-moe --deepep-mode low_latency --mem-fraction-static 0.8 --max-running-requests 128
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 ---disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
 ```

 ## ASCEND
@@ -212,8 +212,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | Arguments | Description | Defaults |
 |-----------|-------------|----------|
 | `--ep-size` | The expert parallelism size. | 1 |
-| `--enable-ep-moe` | Enabling expert parallelism for moe. The ep size is equal to the tp size. | False |
-| `--enable-deepep-moe` | Enabling DeepEP MoE implementation for EP MoE. | False |
+| `--moe-a2a-backend` | Select the backend for all-to-all communication for expert parallelism. | None |
 | `--enable-flashinfer-cutlass-moe` | Enabling Flashinfer Cutlass MoE implementation for high throughput. | False |
 | `--enable-flashinfer-trtllm-moe` | Enabling Flashinfer Trtllm MoE implementation for low latency. | False |
 | `--deepep-mode` | Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch. | auto |
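Per the deprecation handling later in this diff, `--enable-ep-moe` is replaced by sizing expert parallelism explicitly. A hypothetical invocation (model path and sizes are placeholders, not from the original docs):

```bash
# before (deprecated): EP implied by a boolean, size forced equal to TP
python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1 --tp-size 8 --enable-ep-moe

# after: EP sized explicitly via --ep-size
python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1 --tp-size 8 --ep-size 8
```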
@@ -28,9 +28,8 @@ spec:
         - --enable-dp-lm-head
         - --dp-size
         - "16"
-        - --enable-deepep-moe
-        - --deepep-mode
-        - low_latency
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - decode
         - --mem-fraction-static
@@ -166,9 +165,8 @@ spec:
         - --enable-dp-lm-head
         - --dp-size
         - "16"
-        - --enable-deepep-moe
-        - --deepep-mode
-        - low_latency
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - decode
         - --mem-fraction-static
@@ -38,9 +38,8 @@ spec:
         - --dp-size
         - "16"
         - --disable-radix-cache
-        - --enable-deepep-moe
-        - --deepep-mode
-        - normal
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - prefill
         - --mem-fraction-static
@@ -184,9 +183,8 @@ spec:
         - --dp-size
         - "16"
         - --disable-radix-cache
-        - --enable-deepep-moe
-        - --deepep-mode
-        - normal
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - prefill
         - --mem-fraction-static
@@ -64,9 +64,8 @@ spec:
         - --dp-size
         - "16"
         - --disable-radix-cache
-        - --enable-deepep-moe
-        - --deepep-mode
-        - normal
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - prefill
         - --mem-fraction-static
@@ -212,9 +211,8 @@ spec:
         - --dp-size
         - "16"
         - --disable-radix-cache
-        - --enable-deepep-moe
-        - --deepep-mode
-        - normal
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - prefill
         - --mem-fraction-static
@@ -373,9 +371,8 @@ spec:
         - --enable-dp-lm-head
         - --dp-size
         - "16"
-        - --enable-deepep-moe
-        - --deepep-mode
-        - low_latency
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - decode
         - --mem-fraction-static
@@ -508,9 +505,8 @@ spec:
         #- --enable-two-batch-overlap
         - --dp-size
         - "16"
-        - --enable-deepep-moe
-        - --deepep-mode
-        - low_latency
+        - --moe-a2a-backend
+        - deepep
         - --disaggregation-mode
         - decode
         - --mem-fraction-static
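All eight manifest hunks above apply the same three-for-two substitution in the container args; the explicit `normal`/`low_latency` choice disappears because `--deepep-mode` now defaults to `auto`:

```yaml
# before (deprecated)
- --enable-deepep-moe
- --deepep-mode
- low_latency   # or: normal, on prefill pods

# after
- --moe-a2a-backend
- deepep
```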
@@ -288,12 +288,14 @@ class _SinglePassGatherer(ABC):
         )

         if server_args.expert_distribution_recorder_mode == "stat_approx":
-            if server_args.enable_deepep_moe and (server_args.deepep_mode == "normal"):
+            if server_args.moe_a2a_backend is not None and (
+                server_args.deepep_mode == "normal"
+            ):
                 return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
             else:
                 raise NotImplementedError

-        if server_args.enable_deepep_moe:
+        if server_args.moe_a2a_backend is not None:
             if server_args.deepep_mode == "normal":
                 return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
             elif server_args.deepep_mode == "low_latency":
@@ -108,7 +108,7 @@ class LayerScatterModes:
         if context.is_layer_sparse:
             return (
                 ScatterMode.SCATTERED
-                if global_server_args_dict["enable_deepep_moe"]
+                if not global_server_args_dict["moe_a2a_backend"].is_standard()
                 else ScatterMode.FULL
             )
         else:
@@ -1,28 +1,17 @@
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import TYPE_CHECKING, Optional

 import torch

-from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
 from sglang.srt.layers.moe.ep_moe.kernels import (
     ep_gather,
     ep_scatter,
-    gelu_and_mul_triton_kernel,
-    grouped_gemm_triton,
     moe_ep_deepgemm_preprocess,
     post_reorder_triton_kernel,
-    pre_reorder_triton_kernel,
-    pre_reorder_triton_kernel_for_cutlass_moe,
-    run_cutlass_moe_ep_preproess,
-    run_moe_ep_preproess,
     silu_and_mul_masked_post_quant_fwd,
-    silu_and_mul_triton_kernel,
     tma_align_input_scale,
 )
 from sglang.srt.layers.moe.fused_moe_triton.layer import (
@@ -31,11 +20,9 @@ from sglang.srt.layers.moe.fused_moe_triton.layer import (
     should_use_flashinfer_trtllm_moe,
 )
 from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
-from sglang.srt.layers.quantization.base_config import (
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8 import (
     Fp8Config,
     Fp8MoEMethod,
@@ -44,23 +31,13 @@ from sglang.srt.layers.quantization.fp8 import (
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
     sglang_per_token_group_quant_fp8,
-    sglang_per_token_quant_fp8,
 )
-from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
-from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import (
-    DeepEPMode,
-    ceil_div,
-    dispose_tensor,
-    get_bool_env_var,
-    is_hip,
-    is_npu,
-)
+from sglang.srt.utils import ceil_div, dispose_tensor, get_bool_env_var, is_hip, is_npu

 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.ep_moe.token_dispatcher import (
+    from sglang.srt.layers.moe.token_dispatcher import (
         DeepEPLLOutput,
         DeepEPNormalOutput,
         DispatchOutput,
@@ -119,7 +96,6 @@ class EPMoE(FusedMoE):
             activation=activation,
             # apply_router_weight_on_input=apply_router_weight_on_input,
             routed_scaling_factor=routed_scaling_factor,
-            enable_ep_moe=True,
         )

         self.start_expert_id = self.moe_ep_rank * self.num_local_experts
@@ -328,7 +304,7 @@ class DeepEPMoE(EPMoE):
         prefix: str = "",
         activation: str = "silu",
         routed_scaling_factor: Optional[float] = None,
-        deepep_mode: DeepEPMode = DeepEPMode.auto,
+        deepep_mode: DeepEPMode = DeepEPMode.AUTO,
     ):
         super().__init__(
             num_experts=num_experts,
@@ -348,7 +324,6 @@ class DeepEPMoE(EPMoE):

         # TODO: move to the beginning of the file
         from sglang.srt.distributed.parallel_state import get_tp_group
-        from sglang.srt.managers.schedule_batch import global_server_args_dict
         from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher

         self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
@@ -762,11 +737,10 @@ class FlashInferEPMoE(EPMoE):


 def get_moe_impl_class():
-    if global_server_args_dict["enable_deepep_moe"]:
+    if global_server_args_dict["moe_a2a_backend"].is_deepep():
         return DeepEPMoE
     if global_server_args_dict["enable_flashinfer_cutlass_moe"]:
-        # Must come before EPMoE because FusedMoE also supports enable_ep_moe
         return FusedMoE
-    if global_server_args_dict["enable_ep_moe"]:
+    if get_moe_expert_parallel_world_size() > 1:
         return FlashInferEPMoE if should_use_flashinfer_trtllm_moe() else EPMoE
     return FlashInferFusedMoE if should_use_flashinfer_trtllm_moe() else FusedMoE
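`get_moe_impl_class` now derives expert parallelism from the distributed state instead of the `enable_ep_moe` flag. The hunk also silently drops the old inline comment explaining why the flashinfer-cutlass check precedes the EP check; the ordering still matters, since `FusedMoE` covers EP itself in that configuration. A condensed restatement of the dispatch order above, for reading convenience only:

```python
# condensed restatement of get_moe_impl_class after this hunk, not new behavior
def pick_moe_impl():
    if global_server_args_dict["moe_a2a_backend"].is_deepep():
        return DeepEPMoE
    if global_server_args_dict["enable_flashinfer_cutlass_moe"]:
        return FusedMoE  # checked before the EP-size branch: FusedMoE handles EP here
    if get_moe_expert_parallel_world_size() > 1:
        return FlashInferEPMoE if should_use_flashinfer_trtllm_moe() else EPMoE
    return FlashInferFusedMoE if should_use_flashinfer_trtllm_moe() else FusedMoE
```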
@@ -14,8 +14,6 @@ from sglang.srt.distributed import (
     get_moe_expert_parallel_world_size,
     get_moe_tensor_parallel_rank,
     get_moe_tensor_parallel_world_size,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
 from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
@@ -94,7 +92,6 @@ class FusedMoE(torch.nn.Module):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
         enable_flashinfer_cutlass_moe: Optional[bool] = False,
-        enable_ep_moe: Optional[bool] = False,
     ):
         super().__init__()

@@ -112,7 +109,6 @@ class FusedMoE(torch.nn.Module):
         if enable_flashinfer_cutlass_moe and quant_config is None:
             logger.warning("Disable flashinfer MoE when quantization config is None.")
             enable_flashinfer_cutlass_moe = False
-            enable_ep_moe = False

         self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
         self.moe_ep_size = get_moe_expert_parallel_world_size()
@@ -121,7 +117,7 @@ class FusedMoE(torch.nn.Module):
         self.moe_tp_rank = get_moe_tensor_parallel_rank()
         assert num_experts % self.moe_ep_size == 0
         self.num_local_experts = num_experts // self.moe_ep_size
-        if enable_ep_moe:
+        if self.moe_ep_size > 1:
             # TODO(ch-wan): support shared experts fusion
             # Create a tensor of size num_experts filled with -1
             self.expert_map_cpu = torch.full((self.num_experts,), -1, dtype=torch.int32)
@@ -0,0 +1,23 @@
+from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
+    BaseDispatcher,
+    BaseDispatcherConfig,
+    DispatchOutput,
+    DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.token_dispatcher.deepep import (
+    DeepEPConfig,
+    DeepEPDispatcher,
+    DeepEPLLOutput,
+    DeepEPNormalOutput,
+)
+
+__all__ = [
+    "BaseDispatcher",
+    "BaseDispatcherConfig",
+    "DispatchOutput",
+    "DispatchOutputFormat",
+    "DeepEPConfig",
+    "DeepEPDispatcher",
+    "DeepEPNormalOutput",
+    "DeepEPLLOutput",
+]
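With this `__init__.py` in place, call sites can import dispatcher types from the package root instead of `ep_moe.token_dispatcher`; the operations-strategy hunk near the end of this diff does exactly that:

```python
# new import path introduced by this __init__.py
from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig, DeepEPDispatcher
```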
@@ -2,11 +2,22 @@ from __future__ import annotations

 from abc import ABC, abstractmethod
 from enum import Enum, auto
-from typing import TYPE_CHECKING, NamedTuple, Protocol, runtime_checkable
+from typing import Protocol, runtime_checkable

 import torch


+class MoEA2ABackend(Enum):
+    none = "none"
+    deepep = "deepep"
+
+    def is_none(self):
+        return self == MoEA2ABackend.none
+
+    def is_deepep(self):
+        return self == MoEA2ABackend.deepep
+
+
 class DispatchOutputFormat(Enum):
     standard = auto()
     deepep_normal = auto()
@@ -1,5 +1,3 @@
-# TODO(ch-wan): this file will be moved to sglang/srt/layers/moe/token_dispatcher/deepep.py
-
 from __future__ import annotations

 import logging
@@ -22,15 +20,10 @@ from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
     DispatchOutput,
     DispatchOutputFormat,
 )
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.utils import (
-    DeepEPMode,
-    get_bool_env_var,
-    get_int_env_var,
-    is_hip,
-    load_json_config,
-)
+from sglang.srt.utils import get_bool_env_var, get_int_env_var, is_hip, load_json_config

 try:
     from deep_ep import Buffer, Config
@@ -150,9 +143,9 @@ class DeepEPBuffer:
             num_rdma_bytes,
         )

-        if deepep_mode == DeepEPMode.normal:
+        if deepep_mode == DeepEPMode.NORMAL:
             num_qps_per_rank = DeepEPConfig.get_instance().num_sms // 2
-        elif deepep_mode in [DeepEPMode.low_latency, DeepEPMode.auto]:
+        elif deepep_mode in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]:
             num_qps_per_rank = num_experts // group.size()
         else:
             raise NotImplementedError
@@ -161,7 +154,7 @@ class DeepEPBuffer:
             device="cuda"
         ).multi_processor_count
         if (
-            (deepep_mode != DeepEPMode.low_latency)
+            (deepep_mode != DeepEPMode.LOW_LATENCY)
             and not global_server_args_dict["enable_two_batch_overlap"]
             and (DeepEPConfig.get_instance().num_sms < total_num_sms // 2)
         ):
@@ -611,7 +604,7 @@ class DeepEPDispatcher(BaseDispatcher):
         num_local_experts: int = None,
         hidden_size: int = None,
         params_dtype: torch.dtype = None,
-        deepep_mode: DeepEPMode = DeepEPMode.auto,
+        deepep_mode: DeepEPMode = DeepEPMode.AUTO,
         async_finish: bool = False,
         return_recv_hook: bool = False,
     ):
@@ -697,9 +690,9 @@ class DeepEPDispatcher(BaseDispatcher):
         resolved_deepep_mode = self.deepep_mode.resolve(
             forward_batch.is_extend_in_batch
         )
-        if resolved_deepep_mode == DeepEPMode.normal:
+        if resolved_deepep_mode == DeepEPMode.NORMAL:
             return self._normal_dispatcher
-        elif resolved_deepep_mode == DeepEPMode.low_latency:
+        elif resolved_deepep_mode == DeepEPMode.LOW_LATENCY:
             return self._low_latency_dispatcher
         else:
             raise ValueError(f"Invalid deepep_mode: {self.deepep_mode}")
python/sglang/srt/layers/moe/utils.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from enum import Enum
+
+
+class MoeA2ABackend(Enum):
+
+    STANDARD = ("standard", "none")
+    DEEPEP = "deepep"
+
+    @classmethod
+    def _missing_(cls, value):
+        if value is None:
+            return cls.STANDARD
+        for member in cls:
+            if value in member.value:
+                return member
+        raise ValueError(f"No {cls.__name__} member for value {value}")
+
+    def is_deepep(self):
+        return self == MoeA2ABackend.DEEPEP
+
+    def is_standard(self):
+        return self == MoeA2ABackend.STANDARD
+
+
+class DeepEPMode(Enum):
+    NORMAL = "normal"
+    LOW_LATENCY = "low_latency"
+    AUTO = "auto"
+
+    def enable_normal(self):
+        return self in [DeepEPMode.NORMAL, DeepEPMode.AUTO]
+
+    def enable_low_latency(self):
+        return self in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]
+
+    def resolve(self, is_extend_in_batch: bool):
+        if self != DeepEPMode.AUTO:
+            return self
+
+        if is_extend_in_batch:
+            return DeepEPMode.NORMAL
+        else:
+            return DeepEPMode.LOW_LATENCY
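The two enums are built from raw server-arg strings. `MoeA2ABackend._missing_` maps `None` (an unset `--moe-a2a-backend`) and the aliases `"standard"`/`"none"` onto `STANDARD`, and `DeepEPMode.AUTO` resolves per batch. A quick behavioral sketch against this file:

```python
from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend

assert MoeA2ABackend(None).is_standard()     # unset flag -> standard all-to-all
assert MoeA2ABackend("none").is_standard()   # alias handled by _missing_
assert MoeA2ABackend("deepep").is_deepep()   # direct value match

# AUTO resolves per batch: prefill (extend) -> NORMAL, decode -> LOW_LATENCY
assert DeepEPMode("auto").resolve(is_extend_in_batch=True) == DeepEPMode.NORMAL
assert DeepEPMode("auto").resolve(is_extend_in_batch=False) == DeepEPMode.LOW_LATENCY
```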
@@ -51,6 +51,7 @@ from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
     ScheduleBatchDisaggregationDecodeMixin,
 )
 from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.mem_cache.allocator import (
     BaseTokenToKVPoolAllocator,
     SWATokenToKVPoolAllocator,
@@ -85,9 +86,8 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "enable_dp_attention",
     "enable_two_batch_overlap",
     "enable_dp_lm_head",
-    "enable_deepep_moe",
+    "moe_a2a_backend",
     "deepep_mode",
-    "enable_ep_moe",
     "enable_flashinfer_cutlass_moe",
     "enable_flashinfer_trtllm_moe",
     "enable_flashinfer_allreduce_fusion",
@@ -64,6 +64,7 @@ from sglang.srt.hf_transformers_utils import (
 )
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
@@ -137,7 +138,6 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
 from sglang.srt.utils import (
-    DeepEPMode,
     DynamicGradMode,
     broadcast_pyobj,
     configure_gc_logger,
@@ -1762,8 +1762,10 @@ class Scheduler(
             spec_algorithm=self.spec_algorithm,
             speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
             enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
-            enable_deepep_moe=self.server_args.enable_deepep_moe,
-            deepep_mode=DeepEPMode[self.server_args.deepep_mode],
+            enable_deepep_moe=MoeA2ABackend(
+                self.server_args.moe_a2a_backend
+            ).is_deepep(),
+            deepep_mode=DeepEPMode(self.server_args.deepep_mode),
             require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
             disable_overlap_schedule=self.server_args.disable_overlap_schedule,
         )
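One subtlety in the scheduler hunk: `DeepEPMode[...]` (square brackets) looks members up by *name*, which worked when members were lowercase `normal`/`low_latency`/`auto`; with the new uppercase members the lowercase CLI string has to go through *value* construction, `DeepEPMode(...)`. A minimal illustration of the difference:

```python
from enum import Enum

class DeepEPMode(Enum):
    AUTO = "auto"

DeepEPMode("auto")   # by value -> DeepEPMode.AUTO (what the new code does)
DeepEPMode["AUTO"]   # by name  -> DeepEPMode.AUTO
# DeepEPMode["auto"] would now raise KeyError, hence the switch away from [...]
```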
@@ -38,6 +38,7 @@ import torch
 import triton
 import triton.language as tl

+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
 from sglang.srt.layers.dp_attention import (
     DPPaddingMode,
     get_attention_dp_rank,
@@ -839,7 +840,7 @@ class ForwardBatch:


 def enable_num_token_non_padded(server_args):
-    return server_args.enable_ep_moe or server_args.enable_deepep_moe
+    return get_moe_expert_parallel_world_size() > 1


 class PPProxyTensors:
@@ -60,6 +60,7 @@ from sglang.srt.layers.dp_attention import (
     initialize_dp_attention,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
 from sglang.srt.layers.quantization import (
     deep_gemm_wrapper,
     monkey_patch_isinstance_for_vllm_base_layer,
@@ -217,6 +218,10 @@ class ModelRunner:
                 "use_mla_backend": self.use_mla_backend,
                 "speculative_algorithm": self.spec_algorithm,
             }
+            | {
+                "moe_a2a_backend": MoeA2ABackend(server_args.moe_a2a_backend),
+                "deepep_mode": DeepEPMode(server_args.deepep_mode),
+            }
         )

         # CPU offload
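The extra mapping is merged with PEP 584 dict union (`|`), so `global_server_args_dict` carries typed `MoeA2ABackend`/`DeepEPMode` objects rather than raw strings; that is what lets the model files below call `global_server_args_dict["moe_a2a_backend"].is_deepep()` directly. A sketch of the merge semantics (keys are illustrative):

```python
base = {"use_mla_backend": True}
merged = base | {"deepep_mode": "auto"}  # right operand wins on key collisions
assert merged == {"use_mla_backend": True, "deepep_mode": "auto"}
```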
@@ -29,6 +29,7 @@ from tqdm import tqdm
 from transformers import PretrainedConfig

 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_world_size,
     parallel_state,
     tensor_model_parallel_all_reduce,
@@ -61,7 +62,6 @@ from sglang.srt.layers.moe.ep_moe.layer import (
     get_moe_impl_class,
     should_use_flashinfer_trtllm_moe,
 )
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -96,7 +96,6 @@ from sglang.srt.two_batch_overlap import (
 )
 from sglang.srt.utils import (
     BumpAllocator,
-    DeepEPMode,
     LazyValue,
     add_prefix,
     bind_or_assign,
@@ -333,15 +332,14 @@ class DeepseekV2MoE(nn.Module):
             routed_scaling_factor=self.routed_scaling_factor,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -374,7 +372,7 @@ class DeepseekV2MoE(nn.Module):
             prefix=add_prefix("shared_experts", prefix),
             **(
                 dict(tp_rank=0, tp_size=1)
-                if global_server_args_dict["enable_deepep_moe"]
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
         )
@@ -404,9 +402,9 @@ class DeepseekV2MoE(nn.Module):

         self.top_k = config.num_experts_per_tok

-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.n_routed_experts
                 + global_server_args_dict["ep_num_redundant_experts"]
@@ -428,12 +426,12 @@ class DeepseekV2MoE(nn.Module):
                 num_local_experts=config.n_routed_experts // self.tp_size,
                 hidden_size=config.hidden_size,
                 params_dtype=config.torch_dtype,
-                deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
+                deepep_mode=global_server_args_dict["deepep_mode"],
                 async_finish=True,
                 return_recv_hook=True,
             )

-        self._enable_deepep_moe = global_server_args_dict["enable_deepep_moe"]
+        self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()

     def get_moe_weights(self):
         return [
@@ -2104,11 +2102,8 @@ class DeepseekV2ForCausalLM(nn.Module):
             or self.config.n_shared_experts != 1
         ):
             disable_reason = "Only Deepseek V3/R1 on NV-platform with capability >= 80 can use shared experts fusion optimization."
-        elif (
-            global_server_args_dict["enable_deepep_moe"]
-            or global_server_args_dict["enable_ep_moe"]
-        ):
-            disable_reason = "Deepseek V3/R1 can not use shared experts fusion optimization when in deepep_moe or ep_moe mode."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek V3/R1 can not use shared experts fusion optimization under expert parallelism."

         if disable_reason is not None:
             global_server_args_dict["disable_shared_experts_fusion"] = True
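The remaining model hunks (GLM-4.5, Grok, Mixtral, Qwen2/Qwen3 MoE, Step3) all repeat the two substitutions shown here for DeepSeek-V2: boolean flag reads become enum method calls, and EP sizing comes from the parallel state. Reduced to the recurring pattern (names as they appear in this diff, condensed, not new code):

```python
if global_server_args_dict["moe_a2a_backend"].is_deepep():
    # was: if global_server_args_dict["enable_deepep_moe"]:
    self.ep_size = get_moe_expert_parallel_world_size()
    # was: self.ep_size = get_tensor_model_parallel_world_size()
```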
@@ -23,6 +23,7 @@ from torch import nn
 from transformers import PretrainedConfig

 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     parallel_state,
@@ -50,7 +51,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.ep_moe.layer import (
-    DeepEPMoE,
     get_moe_impl_class,
     should_use_flashinfer_trtllm_moe,
 )
@@ -83,7 +83,6 @@ from sglang.srt.two_batch_overlap import (
 )
 from sglang.srt.utils import (
     BumpAllocator,
-    DeepEPMode,
     LazyValue,
     add_prefix,
     bind_or_assign,
@@ -443,15 +442,14 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             routed_scaling_factor=self.routed_scaling_factor,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -484,7 +482,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             prefix=add_prefix("shared_experts", prefix),
             **(
                 dict(tp_rank=0, tp_size=1)
-                if global_server_args_dict["enable_deepep_moe"]
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
         )
@@ -502,9 +500,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):

         self.top_k = config.num_experts_per_tok

-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.n_routed_experts
                 + global_server_args_dict["ep_num_redundant_experts"]
@@ -526,12 +524,12 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
                 num_local_experts=config.n_routed_experts // self.tp_size,
                 hidden_size=config.hidden_size,
                 params_dtype=config.torch_dtype,
-                deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
+                deepep_mode=global_server_args_dict["deepep_mode"],
                 async_finish=True,
                 return_recv_hook=True,
             )

-        self._enable_deepep_moe = global_server_args_dict["enable_deepep_moe"]
+        self._enable_deepep_moe = global_server_args_dict["moe_a2a_backend"].is_deepep()


 class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
@@ -737,11 +735,8 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             or self.config.n_shared_experts != 1
         ):
             disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
-        elif (
-            global_server_args_dict["enable_deepep_moe"]
-            or global_server_args_dict["enable_ep_moe"]
-        ):
-            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization when in deepep_moe or ep_moe mode."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."

         if disable_reason is not None:
             global_server_args_dict["disable_shared_experts_fusion"] = True
@@ -29,6 +29,7 @@ from torch import nn
 from transformers import PretrainedConfig

 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module):
         )

         kwargs = {}
-        if global_server_args_dict["enable_ep_moe"]:
+        if get_moe_expert_parallel_world_size() > 1:
             MoEImpl = EPMoE
         else:
             MoEImpl = FusedMoE
@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module):

         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
@@ -24,6 +24,7 @@ from torch import nn
 from transformers import MixtralConfig

 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module):
             renormalize=True,
         )

-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
+        MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module):

         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -616,9 +615,7 @@ class Qwen2MoeForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]

-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -24,6 +24,7 @@ import torch
 from torch import nn

 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
 from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-from sglang.srt.utils import DeepEPMode, add_prefix, is_cuda, is_non_idle_and_non_empty
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty

 Qwen3MoeConfig = None

@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )

-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
             )
@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:

-        if not global_server_args_dict["enable_deepep_moe"]:
+        if not global_server_args_dict["moe_a2a_backend"].is_deepep():
             return self.forward_normal(hidden_states)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
             prefix=add_prefix("gate", prefix),
         )

-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -4,7 +4,7 @@ from typing import List, Optional

 import torch

 from sglang.srt import operations
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.operations import Operation

@@ -172,12 +172,11 @@ class ServerArgs:

     # Expert parallelism
     ep_size: int = 1
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
+    moe_a2a_backend: Optional[Literal["deepep"]] = None
     enable_flashinfer_cutlass_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
@@ -272,7 +271,27 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+
     def __post_init__(self):
+        # Check deprecated arguments
+        def print_deprecated_warning(message: str):
+            logger.warning(f"\033[33m{message}\033[0m")
+
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
@@ -455,14 +474,13 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            if self.enable_ep_moe:
-                self.ep_size = self.tp_size
-                logger.warning(
-                    f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-                )
+            assert self.ep_size in [
+                1,
+                self.tp_size,
+            ], "The expert parallel size must be 1 or the same as the tensor parallel size"

         # DeepEP MoE
-        if self.enable_deepep_moe:
+        if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
@@ -486,7 +504,7 @@ class ServerArgs:
         )

         if self.enable_eplb:
-            assert self.enable_ep_moe or self.enable_deepep_moe
+            assert self.ep_size > 1 or self.moe_a2a_backend is not None

         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -1354,30 +1372,27 @@ class ServerArgs:
             help="The expert parallelism size.",
         )
         parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+            "--moe-a2a-backend",
+            type=str,
+            choices=["deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
             action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
             help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
         )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1839,6 +1854,18 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )

+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
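Note: the two deprecated booleans keep working through the `__post_init__` shim above, which rewrites them into the new arguments and prints a warning. A minimal sketch of the equivalence, with illustrative values (constructing `ServerArgs` also runs its full validation, so this is a sketch rather than a standalone test):

```python
from sglang.srt.server_args import ServerArgs

# Deprecated style: booleans are rewritten by __post_init__ with a warning.
old_style = ServerArgs(model_path="dummy", tp_size=8,
                       enable_ep_moe=True, enable_deepep_moe=True)

# New style: explicit expert-parallel size and A2A backend.
new_style = ServerArgs(model_path="dummy", tp_size=8,
                       ep_size=8, moe_a2a_backend="deepep")

# Both configurations end up identical after __post_init__.
assert old_style.ep_size == new_style.ep_size == 8
assert old_style.moe_a2a_backend == new_style.moe_a2a_backend == "deepep"
```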
@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
     CommunicateSummableTensorPairFn,
     ScatterMode,
 )
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.operations import execute_operations, execute_overlapped_operations
 from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
+from sglang.srt.utils import BumpAllocator, get_bool_env_var

 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput

 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")

@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
                     and not local_batch.forward_mode.is_target_verify()
                 )
                 and enable_deepep_moe
-                and (resolved_deepep_mode == DeepEPMode.low_latency)
+                and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
             )
         else:
             self.local_tbo_split_seq_index = 0
@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
         return [nested_list]


-class DeepEPMode(Enum):
-    normal = "normal"
-    low_latency = "low_latency"
-    auto = "auto"
-
-    def enable_normal(self):
-        return self in [DeepEPMode.normal, DeepEPMode.auto]
-
-    def enable_low_latency(self):
-        return self in [DeepEPMode.low_latency, DeepEPMode.auto]
-
-    def resolve(self, is_extend_in_batch: bool):
-        if self != DeepEPMode.auto:
-            return self
-
-        if is_extend_in_batch:
-            return DeepEPMode.normal
-        else:
-            return DeepEPMode.low_latency
-
-
 def is_non_idle_and_non_empty(forward_mode, hidden_states):
     return (
         (forward_mode is not None)
@@ -2414,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
         return True
     elif not server_args.enable_dp_lm_head:
         return True
-    elif not server_args.enable_deepep_moe:
+    elif server_args.moe_a2a_backend is None:
         return True
     else:
         return (
@@ -2430,7 +2409,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
+    if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
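Note: `DeepEPMode` is deleted from `sglang.srt.utils` here and, per the import change in the two-batch-overlap hunk above, now lives in `sglang.srt.layers.moe.utils` with upper-case members (`DeepEPMode.LOW_LATENCY`). A sketch of the relocated enum, mirroring the deleted body; illustrative rather than the exact new source:

```python
from enum import Enum


class DeepEPMode(Enum):
    NORMAL = "normal"
    LOW_LATENCY = "low_latency"
    AUTO = "auto"

    def enable_normal(self) -> bool:
        return self in [DeepEPMode.NORMAL, DeepEPMode.AUTO]

    def enable_low_latency(self) -> bool:
        return self in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]

    def resolve(self, is_extend_in_batch: bool) -> "DeepEPMode":
        if self != DeepEPMode.AUTO:
            return self
        # "auto" picks normal kernels for prefill (extend) batches and
        # low-latency kernels for decode batches.
        return DeepEPMode.NORMAL if is_extend_in_batch else DeepEPMode.LOW_LATENCY
```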
@@ -499,7 +499,6 @@ class SRTRunner:
         chunked_prefill_size: Optional[int] = None,
         dp_size: int = 1,
         tokenizer_path: Optional[str] = None,
-        enable_ep_moe: bool = False,
         mem_fraction_static: float = 0.65,
         trust_remote_code: bool = False,
         speculative_draft_model_path: Optional[str] = None,
@@ -550,7 +549,6 @@ class SRTRunner:
             enable_dp_attention=enable_dp_attention,
             dp_size=dp_size,
             tokenizer_path=tokenizer_path,
-            enable_ep_moe=enable_ep_moe,
             disable_overlap_schedule=disable_overlap_schedule,
             cuda_graph_max_bs=cuda_graph_max_bs,
             disable_custom_all_reduce=disable_custom_all_reduce,
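Note: with the `enable_ep_moe` keyword removed from the test runner, callers configure expert parallelism through the parallel sizes themselves. A hedged usage sketch; passing `ep_size` through `SRTRunner` is an assumption, as the forwarding is not shown in this hunk:

```python
from sglang.test.runners import SRTRunner

# Illustrative only: ep_size replaces enable_ep_moe=True,
# which used to imply ep_size == tp_size.
runner = SRTRunner(
    model_path="dummy-model",  # illustrative model path
    tp_size=2,
    ep_size=2,
)
```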
@@ -33,7 +33,8 @@ class TestDeepseek(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--enable-two-batch-overlap",
                 "--ep-num-redundant-experts",
                 "32",
@@ -88,7 +89,8 @@ class TestDeepseekMTP(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--enable-two-batch-overlap",
                 "--ep-num-redundant-experts",
                 "32",
@@ -31,7 +31,8 @@ class TestPureDP(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "4",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
                 "--max-running-requests",
@@ -77,7 +78,8 @@ class TestHybridDPTP(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "2",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
                 "--max-running-requests",
@@ -118,7 +120,8 @@ class TestTP(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "4",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
                 "--max-running-requests",
@@ -166,7 +169,8 @@ class TestNoGatherdBuffer(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -212,7 +216,8 @@ class TestTBO(CustomTestCase):
                 "4",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--enable-two-batch-overlap",
                 "--cuda-graph-max-bs",
                 "128",
@@ -259,7 +264,8 @@ class TestMTP(CustomTestCase):
                 "--dp",
                 "2",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -326,7 +332,8 @@ class TestMTPWithTBO(CustomTestCase):
                 "--dp-size",
                 "4",
                 "--enable-two-batch-overlap",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--trust-remote-code",
                 "--speculative-algorithm",
                 "EAGLE",
@@ -34,7 +34,8 @@ class _BaseTestDynamicEPLB(CustomTestCase):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",
@@ -96,8 +97,7 @@ class TestStaticEPLB(CustomTestCase):
                 trust_remote_code=True,
                 ep_num_redundant_experts=4,
                 enable_dp_attention=True,
-                enable_deepep_moe=True,
-                deepep_mode="normal",
+                moe_a2a_backend="deepep",
                 disable_cuda_graph=True,
                 expert_distribution_recorder_mode="stat",
                 tp_size=2,
@@ -407,9 +407,8 @@ class Test10(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "8",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -449,9 +448,8 @@ class Test11(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "4",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -491,9 +489,8 @@ class Test12(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "8",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -532,9 +529,8 @@ class Test13(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -576,9 +572,8 @@ class Test14(CustomTestCase):
                 "4",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -620,9 +615,8 @@ class Test15(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -663,9 +657,8 @@ class Test16(CustomTestCase):
                 "--dp",
                 "4",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -706,9 +699,8 @@ class Test17(CustomTestCase):
                 "--dp",
                 "8",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -751,9 +743,8 @@ class Test18(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -796,9 +787,8 @@ class Test19(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -835,7 +825,8 @@ class Test20(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "8",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -873,7 +864,8 @@ class Test21(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "4",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -911,7 +903,8 @@ class Test22(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "8",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -948,7 +941,8 @@ class Test23(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -988,7 +982,8 @@ class Test24(CustomTestCase):
                 "4",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -1028,7 +1023,8 @@ class Test25(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -1067,7 +1063,8 @@ class Test26(CustomTestCase):
                 "--dp",
                 "4",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -1106,7 +1103,8 @@ class Test27(CustomTestCase):
                 "--dp",
                 "8",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -1147,7 +1145,8 @@ class Test28(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -1188,7 +1187,8 @@ class Test29(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
             ],
         )

@@ -1701,9 +1701,8 @@ class Test40(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "8",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -1755,9 +1754,8 @@ class Test41(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "4",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -1809,9 +1807,8 @@ class Test42(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "8",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -1862,9 +1859,8 @@ class Test43(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -1918,9 +1914,8 @@ class Test44(CustomTestCase):
                 "4",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -1974,9 +1969,8 @@ class Test45(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -2029,9 +2023,8 @@ class Test46(CustomTestCase):
                 "--dp",
                 "4",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -2084,9 +2077,8 @@ class Test47(CustomTestCase):
                 "--dp",
                 "8",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -2141,9 +2133,8 @@ class Test48(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -2198,9 +2189,8 @@ class Test49(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -2249,7 +2239,8 @@ class Test50(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "8",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2297,7 +2288,8 @@ class Test51(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "4",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2345,7 +2337,8 @@ class Test52(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "8",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2392,7 +2385,8 @@ class Test53(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2442,7 +2436,8 @@ class Test54(CustomTestCase):
                 "4",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2492,7 +2487,8 @@ class Test55(CustomTestCase):
                 "8",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2541,7 +2537,8 @@ class Test56(CustomTestCase):
                 "--dp",
                 "4",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2590,7 +2587,8 @@ class Test57(CustomTestCase):
                 "--dp",
                 "8",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2641,7 +2639,8 @@ class Test58(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -2692,7 +2691,8 @@ class Test59(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-ep-moe",
+                "--ep",
+                "8",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -27,7 +27,8 @@ class TestPureTP(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "2",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--disable-cuda-graph",
             ],
         )
@@ -65,7 +66,8 @@ class TestDPAttn(unittest.TestCase):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",
@@ -31,7 +31,8 @@ class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "8",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
@@ -27,7 +27,6 @@ class TestEpMoE(CustomTestCase):
                 "2",
                 "--ep-size",
                 "2",
-                "--enable-ep-moe",
             ],
         )

@@ -75,7 +74,6 @@ class TestEpMoEFP8(CustomTestCase):
                 "2",
                 "--ep-size",
                 "2",
-                "--enable-ep-moe",
                 "--quantization",
                 "fp8",
             ],
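Note: these EP MoE tests keep `--ep-size 2` and simply drop the deprecated enable flag, suggesting that an expert-parallel size greater than 1 now selects EP MoE on its own. Illustrative before/after launch-argument lists:

```python
# Deprecated: a separate enable flag alongside the size.
old_args = ["--tp-size", "2", "--ep-size", "2", "--enable-ep-moe"]

# After this change: the size alone is enough.
new_args = ["--tp-size", "2", "--ep-size", "2"]
```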
@@ -33,7 +33,8 @@ class TestTwoBatchOverlap(unittest.TestCase):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
@@ -122,7 +123,8 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph