Add torchao quant (int4/int8/fp8) to llama models (#1341)

Co-authored-by: Lianmin Zheng <lianminzheng@gmail.com>
2024-09-09 05:32:41 -07:00
parent e4d68afcf0
commit a7c47e0f02
10 changed files with 151 additions and 12 deletions
--- a/test/srt/test_eval_accuracy_mini.py
+++ b/test/srt/test_eval_accuracy_mini.py
@@ -29,12 +29,12 @@ class TestEvalAccuracyMini(unittest.TestCase):
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
-            num_examples=32,
+            num_examples=64,
            num_threads=32,
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.6
+        assert metrics["score"] >= 0.65


 if __name__ == "__main__":
--- a/test/srt/test_moe_eval_accuracy_large.py
+++ b/test/srt/test_moe_eval_accuracy_large.py
@@ -42,7 +42,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.62, f"{metrics}"
+        assert metrics["score"] >= 0.625, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
@@ -54,7 +54,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.42, f"{metrics}"
+        assert metrics["score"] >= 0.425, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
@@ -66,7 +66,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.62, f"{metrics}"
+        assert metrics["score"] >= 0.625, f"{metrics}"


 if __name__ == "__main__":
--- a/test/srt/test_torch_compile.py
+++ b/test/srt/test_torch_compile.py
@@ -22,7 +22,7 @@ class TestTorchCompile(unittest.TestCase):
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--enable-torch-compile", "--disable-radix-cache"],
+            other_args=["--enable-torch-compile"],
        )

    @classmethod
@@ -34,12 +34,12 @@ class TestTorchCompile(unittest.TestCase):
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
-            num_examples=32,
+            num_examples=64,
            num_threads=32,
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.6
+        assert metrics["score"] >= 0.65

    def run_decode(self, max_new_tokens):
        response = requests.post(
--- a/test/srt/test_torchao.py
+++ b/test/srt/test_torchao.py
@@ -0,0 +1,73 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestTorchCompile(unittest.TestCase):
+    """Accuracy and decode-speed checks with torchao ``int4wo-128`` enabled."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch one server, shared by all tests, with ``--torchao-config``."""
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--torchao-config", "int4wo-128"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Kill the server process started in ``setUpClass``."""
+        kill_child_process(cls.process.pid)
+
+    def test_mmlu(self):
+        """Run the ``mmlu`` eval on 64 examples and require a score >= 0.65."""
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.65, f"{metrics}"
+
+    def run_decode(self, max_new_tokens):
+        """POST a temperature-0 ``/generate`` request; return the parsed JSON."""
+        payload = {
+            "text": "The capital of France is",
+            "sampling_params": {"temperature": 0, "max_new_tokens": max_new_tokens},
+            "ignore_eos": True,
+        }
+        # Bound the wait so a stalled server fails the test instead of hanging it.
+        response = requests.post(self.base_url + "/generate", json=payload, timeout=60)
+        response.raise_for_status()  # fail on HTTP errors before parsing the body
+        return response.json()
+
+    def test_throughput(self):
+        """Time one 256-token decode request and require >= 210 tokens/s."""
+        import time
+        max_tokens = 256
+        tic = time.perf_counter()  # monotonic, unlike time.time()
+        res = self.run_decode(max_tokens)
+        throughput = max_tokens / (time.perf_counter() - tic)
+        print(res["text"])
+        print(f"Throughput: {throughput} tokens/s")
+        assert throughput >= 210, f"throughput {throughput} tokens/s is below 210"
+
+
+if __name__ == "__main__":  # run the suite when this file is executed directly
+    unittest.main()
--- a/test/srt/test_triton_attn_backend.py
+++ b/test/srt/test_triton_attn_backend.py
@@ -32,12 +32,12 @@ class TestTritonAttnBackend(unittest.TestCase):
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
-            num_examples=32,
+            num_examples=64,
            num_threads=32,
        )

        metrics = run_eval(args)
-        assert metrics["score"] >= 0.6
+        assert metrics["score"] >= 0.65


 if __name__ == "__main__":