Add integration with gemlite weight only quant (#2528)

This commit is contained in:
Jerry Zhang
2024-12-20 08:25:25 -08:00
committed by GitHub
parent d95a5f5bf5
commit feb2b768ba
4 changed files with 61 additions and 1 deletions

View File

@@ -385,6 +385,19 @@ def latency_test(
8, # shorter decoding to speed up the warmup
server_args.device,
)
try:
import os
import pwd
from gemlite.core import GemLiteLinearTriton
GemLiteLinearTriton.cache_config(
f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
)
except ImportError:
pass
rank_print("Benchmark ...")
# Run the sweep