Fix OOM issues with FP8 for Llama (#1454)

This commit is contained in:
Lianmin Zheng
2024-09-18 03:45:19 -07:00
committed by GitHub
parent aa2750beb3
commit 1acccb364a
8 changed files with 33 additions and 21 deletions

View File

@@ -1,6 +1,7 @@
import unittest
from sglang.test.test_utils import (
DEFAULT_FP8_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
is_in_ci,
@@ -59,6 +60,17 @@ class TestBenchServing(unittest.TestCase):
if is_in_ci():
assert res["output_throughput"] > 2600
def test_offline_throughput_default_fp8(self):
    """Benchmark offline throughput using the default FP8 model.

    Sends 500 prompts at an unbounded request rate with no extra
    server arguments; in CI, the measured output throughput must
    exceed the regression threshold.
    """
    threshold = 3100  # tokens/s floor enforced only in CI
    result = run_bench_serving(
        model=DEFAULT_FP8_MODEL_NAME_FOR_TEST,
        num_prompts=500,
        request_rate=float("inf"),
        other_server_args=[],
    )
    if is_in_ci():
        assert result["output_throughput"] > threshold
def test_online_latency_default(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,

View File

@@ -12,8 +12,10 @@ from sglang.test.test_utils import (
class TestChunkedPrefill(unittest.TestCase):
def run_mmlu(self, disable_radix_cache, enable_mixed_chunk):
other_args = ["--chunked-prefill-size", "32"]
def run_mmlu(
self, disable_radix_cache, enable_mixed_chunk, chunked_prefill_size=32
):
other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
if disable_radix_cache:
other_args += ["--disable-radix-cache"]
@@ -55,6 +57,11 @@ class TestChunkedPrefill(unittest.TestCase):
def test_mixed_chunked_prefill_without_radix_cache(self):
    """Run the MMLU check with mixed chunking on and the radix cache off."""
    self.run_mmlu(enable_mixed_chunk=True, disable_radix_cache=True)
def test_no_chunked_prefill(self):
    """Run the MMLU check with chunked prefill disabled entirely.

    A chunked_prefill_size of -1 turns the feature off; radix cache
    and mixed chunking are left at their default (disabled) settings.
    """
    self.run_mmlu(
        disable_radix_cache=False,
        enable_mixed_chunk=False,
        chunked_prefill_size=-1,
    )
# Allow running this test module directly via the unittest runner.
if __name__ == "__main__":
    unittest.main()