Enable Nvidia's ModelOpt fp8 quantized models (#2535)

Author:       Zhiyu
Date:         2025-01-06 14:54:52 -08:00
Committed by: GitHub
Parent:       b8574f6953
Commit:       287427e2e6

5 changed files with 185 additions and 0 deletions

@@ -149,6 +149,7 @@ class ServerArgs:
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
+    modelopt_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -361,6 +362,7 @@ class ServerArgs:
"awq_marlin",
"bitsandbytes",
"gguf",
"modelopt",
],
help="The quantization method.",
)
@@ -808,6 +810,12 @@ class ServerArgs:
             default=ServerArgs.torchao_config,
             help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_row",
         )
+        parser.add_argument(
+            "--modelopt-config",
+            type=str,
+            default=ServerArgs.modelopt_config,
+            help="Optimize the model with nvidia-modelopt. Experimental feature. Current choices are: fp8",
+        )
         parser.add_argument(
             "--enable-nan-detection",
             action="store_true",