diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 72cc0f83a..41905e272 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -150,7 +150,6 @@ class ModelRunner:
                 "enable_nan_detection": server_args.enable_nan_detection,
                 "enable_dp_attention": server_args.enable_dp_attention,
                 "enable_ep_moe": server_args.enable_ep_moe,
-                "modelopt_config": server_args.modelopt_config,
             }
         )
 
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 6950d00b3..5ed78dc5e 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -149,7 +149,6 @@ class ServerArgs:
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
-    modelopt_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -810,12 +809,6 @@
             default=ServerArgs.torchao_config,
             help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-, fp8wo, fp8dq-per_tensor, fp8dq-per_row",
         )
-        parser.add_argument(
-            "--modelopt-config",
-            type=str,
-            default=ServerArgs.modelopt_config,
-            help="Optimize the model with nvidia-modelopt. Experimental feature. Current choices are: fp8",
-        )
         parser.add_argument(
             "--enable-nan-detection",
             action="store_true",