diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 54b532b4a..1ed3d6880 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -183,7 +183,7 @@ class ServerArgs: enable_flashmla: bool = False flashinfer_mla_disable_ragged: bool = False warmups: Optional[str] = None - n_share_experts_fusion: Optional[int] = None + n_share_experts_fusion: int = 0 disable_shared_experts_fusion: bool = False # Debug tensor dumps @@ -1110,7 +1110,7 @@ class ServerArgs: parser.add_argument( "--n-share-experts-fusion", type=int, - default=None, + default=0, help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1 " "we use tp_size by default.", )