Re-organize CI tests (#1052)
This commit is contained in:
5
.github/workflows/e2e-test.yml
vendored
5
.github/workflows/e2e-test.yml
vendored
@@ -45,8 +45,7 @@ jobs:
|
|||||||
cd test/srt
|
cd test/srt
|
||||||
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
|
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
|
||||||
|
|
||||||
- name: Benchmark Serving Throughput (w/o FlashInfer)
|
- name: Benchmark Serving Throughput (w/ ChunkedPrefill)
|
||||||
run: |
|
run: |
|
||||||
cd test/srt
|
cd test/srt
|
||||||
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer
|
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill
|
||||||
|
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class BaseToolCache:
|
|||||||
return val
|
return val
|
||||||
|
|
||||||
def init_value(self, key):
|
def init_value(self, key):
|
||||||
raise NotImplementedError
|
raise NotImplementedError()
|
||||||
|
|
||||||
def get_cache_hit_rate(self):
|
def get_cache_hit_rate(self):
|
||||||
if self.metrics["total"] == 0:
|
if self.metrics["total"] == 0:
|
||||||
|
|||||||
@@ -410,6 +410,7 @@ class ModelTpServer:
|
|||||||
|
|
||||||
# Print stats
|
# Print stats
|
||||||
if self.tp_rank == 0:
|
if self.tp_rank == 0:
|
||||||
|
if isinstance(self.tree_cache, RadixCache):
|
||||||
self.tree_cache_metrics["total"] += (
|
self.tree_cache_metrics["total"] += (
|
||||||
adder.log_input_tokens + adder.log_hit_tokens
|
adder.log_input_tokens + adder.log_hit_tokens
|
||||||
) / 10**9
|
) / 10**9
|
||||||
@@ -417,6 +418,8 @@ class ModelTpServer:
|
|||||||
tree_cache_hit_rate = (
|
tree_cache_hit_rate = (
|
||||||
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
tree_cache_hit_rate = 0.0
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[gpu={self.gpu_id}] Prefill batch. "
|
f"[gpu={self.gpu_id}] Prefill batch. "
|
||||||
f"#new-seq: {len(can_run_list)}, "
|
f"#new-seq: {len(can_run_list)}, "
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ class ChunkCache(BasePrefixCache):
|
|||||||
req.last_node = entry
|
req.last_node = entry
|
||||||
|
|
||||||
def insert(self):
|
def insert(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError()
|
||||||
|
|
||||||
def evict(self, num_tokens: int, evict_callback: Callable):
|
def evict(self, num_tokens: int, evict_callback: Callable):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -447,6 +447,15 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
|
|||||||
print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
|
print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Print warnings here
|
||||||
|
if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None:
|
||||||
|
logger.warning(
|
||||||
|
"You set both `--disable-radix-cache` and `--chunked-prefill-size`. "
|
||||||
|
"This combination is an experimental feature and we noticed it can lead to "
|
||||||
|
"wrong generation results. If you want to use chunked prefill, it is recommended "
|
||||||
|
"not using `--disable-radix-cache`."
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("The server is fired up and ready to roll!")
|
logger.info("The server is fired up and ready to roll!")
|
||||||
if pipe_finish_writer is not None:
|
if pipe_finish_writer is not None:
|
||||||
pipe_finish_writer.send("init ok")
|
pipe_finish_writer.send("init ok")
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ suites = {
|
|||||||
"test_openai_server.py",
|
"test_openai_server.py",
|
||||||
"test_skip_tokenizer_init.py",
|
"test_skip_tokenizer_init.py",
|
||||||
"test_torch_compile.py",
|
"test_torch_compile.py",
|
||||||
|
"test_triton_attn_backend.py",
|
||||||
"test_vision_openai_server.py",
|
"test_vision_openai_server.py",
|
||||||
"test_large_max_new_tokens.py",
|
"test_large_max_new_tokens.py",
|
||||||
"models/test_generation_models.py",
|
"models/test_generation_models.py",
|
||||||
|
|||||||
68
test/srt/test_eval_accuracy_large_chunked_prefill.py
Normal file
68
test/srt/test_eval_accuracy_large_chunked_prefill.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
import unittest
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
from sglang.srt.utils import kill_child_process
|
||||||
|
from sglang.test.run_eval import run_eval
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
popen_launch_server,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
    """Accuracy evals against a server launched with ``--chunked-prefill-size 256``.

    A single server process is launched once for the whole class and shared by
    every test. Each test runs one eval suite through ``run_eval`` and requires
    the reported ``score`` to reach a fixed minimum.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the shared server and record its model name and base URL."""
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        # This test uses its own fixed address rather than DEFAULT_URL_FOR_TEST.
        cls.base_url = "http://127.0.0.1:7157"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=300,
            other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"],
        )

    @classmethod
    def tearDownClass(cls):
        """Hand the launched server's pid to ``kill_child_process``."""
        kill_child_process(cls.process.pid)

    def _assert_eval_score(self, eval_name, num_examples, min_score):
        """Run one eval suite and require ``metrics["score"] >= min_score``.

        Args:
            eval_name: Suite name forwarded to ``run_eval`` as ``eval_name``.
            num_examples: Example count forwarded to ``run_eval``; ``None`` is
                passed through unchanged.
            min_score: Lowest acceptable value of ``metrics["score"]``.
        """
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name=eval_name,
            num_examples=num_examples,
            num_threads=1024,
        )
        metrics = run_eval(args)
        # Use the unittest assertion instead of a bare `assert`: it is not
        # stripped under `python -O`, and the full metrics dict is attached to
        # the failure message for debugging.
        self.assertGreaterEqual(metrics["score"], min_score, f"{metrics}")

    def test_mmlu(self):
        """MMLU on 3000 examples must score at least 0.71."""
        self._assert_eval_score("mmlu", num_examples=3000, min_score=0.71)

    def test_human_eval(self):
        """HumanEval (``num_examples=None``) must score at least 0.65."""
        self._assert_eval_score("humaneval", num_examples=None, min_score=0.65)

    def test_mgsm_en(self):
        """MGSM-en (``num_examples=None``) must score at least 0.85."""
        self._assert_eval_score("mgsm_en", num_examples=None, min_score=0.85)


if __name__ == "__main__":
    unittest.main()
|
||||||
@@ -3,6 +3,7 @@ import unittest
|
|||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
|
|
||||||
from sglang.bench_serving import run_benchmark
|
from sglang.bench_serving import run_benchmark
|
||||||
|
from sglang.srt.server_args import ServerArgs
|
||||||
from sglang.srt.utils import kill_child_process
|
from sglang.srt.utils import kill_child_process
|
||||||
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
|
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
|
||||||
|
|
||||||
@@ -60,9 +61,9 @@ class TestServingThroughput(unittest.TestCase):
|
|||||||
|
|
||||||
def test_default(self):
|
def test_default(self):
|
||||||
res = self.run_test(
|
res = self.run_test(
|
||||||
disable_radix_cache=False,
|
disable_radix_cache=ServerArgs.disable_radix_cache,
|
||||||
disable_flashinfer=False,
|
disable_flashinfer=ServerArgs.disable_flashinfer,
|
||||||
chunked_prefill_size=-1,
|
chunked_prefill_size=ServerArgs.chunked_prefill_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||||
@@ -72,21 +73,25 @@ class TestServingThroughput(unittest.TestCase):
|
|||||||
def test_default_without_radix_cache(self):
|
def test_default_without_radix_cache(self):
|
||||||
res = self.run_test(
|
res = self.run_test(
|
||||||
disable_radix_cache=True,
|
disable_radix_cache=True,
|
||||||
disable_flashinfer=False,
|
disable_flashinfer=ServerArgs.disable_flashinfer,
|
||||||
chunked_prefill_size=-1,
|
chunked_prefill_size=ServerArgs.chunked_prefill_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||||
# A100 (PCIE) performance
|
# A100 (PCIE) performance
|
||||||
assert res["output_throughput"] >= 1450
|
assert res["output_throughput"] >= 1450
|
||||||
|
|
||||||
def test_default_without_flashinfer(self):
|
def test_default_with_chunked_prefill(self):
|
||||||
self.run_test(
|
res = self.run_test(
|
||||||
disable_radix_cache=False,
|
disable_radix_cache=ServerArgs.disable_radix_cache,
|
||||||
disable_flashinfer=True,
|
disable_flashinfer=ServerArgs.disable_flashinfer,
|
||||||
chunked_prefill_size=-1,
|
chunked_prefill_size=8192,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||||
|
# A100 (PCIE) performance
|
||||||
|
assert res["output_throughput"] >= 1400
|
||||||
|
|
||||||
def test_all_cases(self):
|
def test_all_cases(self):
|
||||||
for disable_radix_cache in [False, True]:
|
for disable_radix_cache in [False, True]:
|
||||||
for disable_flashinfer in [False, True]:
|
for disable_flashinfer in [False, True]:
|
||||||
|
|||||||
41
test/srt/test_triton_attn_backend.py
Normal file
41
test/srt/test_triton_attn_backend.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
import unittest
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
from sglang.srt.utils import kill_child_process
|
||||||
|
from sglang.test.run_eval import run_eval
|
||||||
|
from sglang.test.test_utils import (
|
||||||
|
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||||
|
DEFAULT_URL_FOR_TEST,
|
||||||
|
popen_launch_server,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTritonAttnBackend(unittest.TestCase):
    """MMLU smoke test against a server launched with ``--disable-flashinfer``.

    A single server process is launched once for the class; the test runs a
    small MMLU sample through ``run_eval`` and checks the reported ``score``.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the shared server with FlashInfer disabled."""
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"]
        )

    @classmethod
    def tearDownClass(cls):
        """Hand the launched server's pid to ``kill_child_process``."""
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        """MMLU on 32 examples must score at least 0.6."""
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=32,
            num_threads=32,
        )

        metrics = run_eval(args)
        # Use the unittest assertion instead of a bare `assert`: it is not
        # stripped under `python -O`, and the metrics dict is included in the
        # failure message so a low score can be diagnosed from the log.
        self.assertGreaterEqual(metrics["score"], 0.6, f"{metrics}")


if __name__ == "__main__":
    unittest.main()
|
||||||
Reference in New Issue
Block a user