Add torchao quant (int4/int8/fp8) to llama models (#1341)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
Authored by: Jerry Zhang
Date: 2024-09-09 05:32:41 -07:00
Committed by: GitHub
parent e4d68afcf0
commit a7c47e0f02
10 changed files with 151 additions and 12 deletions

View File

@@ -22,7 +22,7 @@ class TestTorchCompile(unittest.TestCase):
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-other_args=["--enable-torch-compile", "--disable-radix-cache"],
+other_args=["--enable-torch-compile"],
)
@classmethod
@@ -34,12 +34,12 @@ class TestTorchCompile(unittest.TestCase):
base_url=self.base_url,
model=self.model,
eval_name="mmlu",
-num_examples=32,
+num_examples=64,
num_threads=32,
)
metrics = run_eval(args)
-assert metrics["score"] >= 0.6
+assert metrics["score"] >= 0.65
def run_decode(self, max_new_tokens):
response = requests.post(