Fix OOM issues with FP8 for Llama (#1454)

This commit is contained in:
Lianmin Zheng
2024-09-18 03:45:19 -07:00
committed by GitHub
parent aa2750beb3
commit 1acccb364a
8 changed files with 33 additions and 21 deletions

View File

@@ -1,6 +1,7 @@
import unittest
from sglang.test.test_utils import (
DEFAULT_FP8_MODEL_NAME_FOR_TEST,
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
is_in_ci,
@@ -59,6 +60,17 @@ class TestBenchServing(unittest.TestCase):
if is_in_ci():
assert res["output_throughput"] > 2600
def test_offline_throughput_default_fp8(self):
    """Benchmark offline throughput using the default FP8 model.

    Sends 500 prompts at an unbounded request rate with no extra
    server arguments; in CI, the measured output throughput must
    exceed the regression threshold.
    """
    threshold = 3100  # tokens/s floor enforced only in CI
    result = run_bench_serving(
        model=DEFAULT_FP8_MODEL_NAME_FOR_TEST,
        num_prompts=500,
        request_rate=float("inf"),
        other_server_args=[],
    )
    if is_in_ci():
        assert result["output_throughput"] > threshold
def test_online_latency_default(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,

View File

@@ -12,8 +12,10 @@ from sglang.test.test_utils import (
class TestChunkedPrefill(unittest.TestCase):
def run_mmlu(self, disable_radix_cache, enable_mixed_chunk):
other_args = ["--chunked-prefill-size", "32"]
def run_mmlu(
self, disable_radix_cache, enable_mixed_chunk, chunked_prefill_size=32
):
other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
if disable_radix_cache:
other_args += ["--disable-radix-cache"]
@@ -55,6 +57,11 @@ class TestChunkedPrefill(unittest.TestCase):
def test_mixed_chunked_prefill_without_radix_cache(self):
    """Run the MMLU check with mixed chunking on and the radix cache off."""
    self.run_mmlu(enable_mixed_chunk=True, disable_radix_cache=True)
def test_no_chunked_prefill(self):
    """Run the MMLU check with chunked prefill disabled entirely.

    A chunked_prefill_size of -1 turns the feature off; radix cache
    and mixed chunking are left at their default (disabled) settings.
    """
    self.run_mmlu(
        disable_radix_cache=False,
        enable_mixed_chunk=False,
        chunked_prefill_size=-1,
    )
# Allow running this test module directly via the unittest runner.
if __name__ == "__main__":
    unittest.main()