[CI] Add more multi-gpu tests (#1280)

This commit is contained in:
Lianmin Zheng
2024-09-01 00:27:25 -07:00
committed by GitHub
parent d134c139a1
commit 1b5d56f7f8
11 changed files with 271 additions and 128 deletions

View File

@@ -0,0 +1,73 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
class TestEvalAccuracyLarge(unittest.TestCase):
    """Accuracy evals run against one server launched with ``--tp 2``.

    The server process is started once in ``setUpClass`` and shared by all
    test methods; it is torn down in ``tearDownClass``.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the shared server for ``DEFAULT_MOE_MODEL_NAME_FOR_TEST``."""
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_flags = ["--log-level-http", "warning", "--tp", "2"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        """Stop the server process started in ``setUpClass``."""
        kill_child_process(cls.process.pid)

    def _evaluate(self, eval_name, num_examples):
        """Run ``run_eval`` for one benchmark and return its metrics.

        Args:
            eval_name: Benchmark identifier forwarded as ``eval_name``.
            num_examples: Value forwarded as ``num_examples`` (the tests pass
                either an int or ``None``).

        Returns:
            Whatever ``run_eval`` returns; the tests index it with ``"score"``.
        """
        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name=eval_name,
            num_examples=num_examples,
            num_threads=1024,
        )
        return run_eval(eval_args)

    def test_mmlu(self):
        """The ``mmlu`` score on 3000 examples must be at least 0.63."""
        metrics = self._evaluate("mmlu", 3000)
        assert metrics["score"] >= 0.63, f"{metrics}"

    def test_human_eval(self):
        """The ``humaneval`` score must be at least 0.43."""
        metrics = self._evaluate("humaneval", None)
        assert metrics["score"] >= 0.43, f"{metrics}"

    def test_mgsm_en(self):
        """The ``mgsm_en`` score must be at least 0.64."""
        metrics = self._evaluate("mgsm_en", None)
        assert metrics["score"] >= 0.64, f"{metrics}"
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,45 @@
import os
import subprocess
import unittest
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
    """Smoke test for ``sglang.bench_latency`` run with ``--tp 2``."""

    def test_default(self):
        """Run the benchmark once and check the figure it prints.

        The benchmark is started as a subprocess for
        ``DEFAULT_MOE_MODEL_NAME_FOR_TEST`` with batch size 1, input length
        128 and output length 8. The threshold is enforced only when the
        environment variable ``SGLANG_IS_IN_CI`` equals ``"true"``.
        """
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        # Cleanup lives in ``finally`` so the benchmark process is not left
        # behind when communicate() is interrupted, parsing raises, or the
        # assertion fails.
        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            # The figure is the second-to-last space-separated token on the
            # third line from the end of stdout.
            # NOTE(review): presumably a throughput number (higher is
            # better, given the ``>`` check) -- confirm against
            # bench_latency's output format.
            lastline = output.split("\n")[-3]
            value = float(lastline.split(" ")[-2])

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                assert value > 125, f"parsed value {value} is not above 125"
        finally:
            kill_child_process(process.pid)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
other_args.append("--disable-flashinfer")
other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
other_args.extend(["--tensor-parallel-size", "2"])
other_args.append("--enable-p2p-check")
model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
)
# Run benchmark
num_prompts = 200
num_prompts = 300
args = SimpleNamespace(
backend="sglang",
base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 950, H100 (SMX): 1800
assert res["output_throughput"] > 1750
assert res["output_throughput"] > 1850
def test_default_without_radix_cache(self):
res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 950, H100 (SMX): 1900
assert res["output_throughput"] > 1850
def test_all_cases(self):
for disable_radix_cache in [False, True]:
for disable_flashinfer in [False, True]:
for chunked_prefill_size in [-1, 2048]:
self.run_test(
disable_radix_cache=False,
disable_flashinfer=False,
chunked_prefill_size=-1,
)
assert res["output_throughput"] > 1950
if __name__ == "__main__":

View File

@@ -0,0 +1,43 @@
import os
import subprocess
import unittest
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
    """Smoke test for ``sglang.bench_latency`` on the default test model."""

    def test_default(self):
        """Run the benchmark once and check the figure it prints.

        The benchmark is started as a subprocess for
        ``DEFAULT_MODEL_NAME_FOR_TEST`` with batch size 1, input length 128
        and output length 8. The threshold is enforced only when the
        environment variable ``SGLANG_IS_IN_CI`` equals ``"true"``.
        """
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        # Cleanup lives in ``finally`` so the benchmark process is not left
        # behind when communicate() is interrupted, parsing raises, or the
        # assertion fails.
        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            # The figure is the second-to-last space-separated token on the
            # third line from the end of stdout.
            # NOTE(review): presumably a throughput number (higher is
            # better, given the ``>`` check) -- confirm against
            # bench_latency's output format.
            lastline = output.split("\n")[-3]
            value = float(lastline.split(" ")[-2])

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                assert value > 130, f"parsed value {value} is not above 130"
        finally:
            kill_child_process(process.pid)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
)
# Run benchmark
num_prompts = 400
num_prompts = 500
args = SimpleNamespace(
backend="sglang",
base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 1450, H100 (SMX): 2550
assert res["output_throughput"] > 2500
assert res["output_throughput"] > 2400
def test_default_without_radix_cache(self):
res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 1500, H100 (SMX): 2850
assert res["output_throughput"] > 2800
def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE): 1450, H100 (SMX): 2550
assert res["output_throughput"] > 2500
def test_all_cases(self):
for disable_radix_cache in [False, True]:
for disable_flashinfer in [False, True]:
for chunked_prefill_size in [-1, 2048]:
self.run_test(
disable_radix_cache=False,
disable_flashinfer=False,
chunked_prefill_size=-1,
)
assert res["output_throughput"] > 2400
if __name__ == "__main__":