Add torchao quant (int4/int8/fp8) to llama models (#1341)
Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
This commit is contained in:
@@ -97,6 +97,7 @@ class ModelRunner:
|
||||
"disable_flashinfer_sampling": server_args.disable_flashinfer_sampling,
|
||||
"triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
|
||||
"enable_mla": server_args.enable_mla,
|
||||
"torchao_config": server_args.torchao_config,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user