Add torchao quant (int4/int8/fp8) to llama models (#1341)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
This commit is contained in:
Jerry Zhang
2024-09-09 05:32:41 -07:00
committed by GitHub
parent e4d68afcf0
commit a7c47e0f02
10 changed files with 151 additions and 12 deletions

View File

@@ -97,6 +97,7 @@ class ModelRunner:
"disable_flashinfer_sampling": server_args.disable_flashinfer_sampling,
"triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
"enable_mla": server_args.enable_mla,
"torchao_config": server_args.torchao_config,
}
)