Add torchao quant (int4/int8/fp8) to llama models (#1341)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
Jerry Zhang
2024-09-09 05:32:41 -07:00
committed by GitHub
parent e4d68afcf0
commit a7c47e0f02
10 changed files with 151 additions and 12 deletions

@@ -95,6 +95,7 @@ class ServerArgs:
disable_custom_all_reduce: bool = False
enable_mixed_chunk: bool = False
enable_torch_compile: bool = False
torchao_config: str = ""
enable_p2p_check: bool = False
enable_mla: bool = False
triton_attention_reduce_in_fp32: bool = False
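The new torchao_config field defaults to an empty string, i.e. no torchao quantization. As a rough sketch of how such a config string could be dispatched onto torchao's quantize_ API (this is not the code from this commit; the helper name apply_torchao_config_ is hypothetical, and the option strings follow the --torchao-config help text added below):

import torch
from torchao.quantization import (
    int4_weight_only,
    int8_dynamic_activation_int8_weight,
    int8_weight_only,
    quantize_,
)

def apply_torchao_config_(model: torch.nn.Module, torchao_config: str) -> None:
    # Quantize the model in place according to the torchao_config string.
    if torchao_config == "":
        return  # default: leave the model unquantized
    if torchao_config == "int8dq":
        # int8 dynamic activation + int8 weight quantization
        quantize_(model, int8_dynamic_activation_int8_weight())
    elif torchao_config == "int8wo":
        # int8 weight-only quantization
        quantize_(model, int8_weight_only())
    elif torchao_config.startswith("int4wo-"):
        # int4 weight-only; the suffix carries the group size, e.g. int4wo-128
        group_size = int(torchao_config.split("-")[1])
        quantize_(model, int4_weight_only(group_size=group_size))
    elif torchao_config == "fp8wo":
        # fp8 weight-only; requires a torchao build that ships float8_weight_only
        from torchao.quantization import float8_weight_only
        quantize_(model, float8_weight_only())
    else:
        raise ValueError(f"Unsupported torchao config: {torchao_config}")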
@@ -443,7 +444,13 @@ class ServerArgs:
parser.add_argument(
"--enable-torch-compile",
action="store_true",
help="Optimize the model with torch.compile, experimental feature.",
help="Optimize the model with torch.compile. Experimental feature.",
)
parser.add_argument(
"--torchao-config",
type=str,
default=ServerArgs.torchao_config,
help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo",
)
parser.add_argument(
"--enable-p2p-check",