Add torchao quant (int4/int8/fp8) to llama models (#1341)
Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
This commit is contained in:
@@ -95,6 +95,7 @@ class ServerArgs:
|
||||
disable_custom_all_reduce: bool = False
|
||||
enable_mixed_chunk: bool = False
|
||||
enable_torch_compile: bool = False
|
||||
torchao_config: str = ""
|
||||
enable_p2p_check: bool = False
|
||||
enable_mla: bool = False
|
||||
triton_attention_reduce_in_fp32: bool = False
|
||||
@@ -443,7 +444,13 @@ class ServerArgs:
|
||||
parser.add_argument(
|
||||
"--enable-torch-compile",
|
||||
action="store_true",
|
||||
help="Optimize the model with torch.compile, experimental feature.",
|
||||
help="Optimize the model with torch.compile. Experimental feature.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--torchao-config",
|
||||
type=str,
|
||||
default=ServerArgs.torchao_config,
|
||||
help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-p2p-check",
|
||||
|
||||
Reference in New Issue
Block a user